howard.objects.variants
import csv
import gc
import gzip
import io
import multiprocessing
import os
import random
import re
import shlex
import sqlite3
import subprocess
from tempfile import NamedTemporaryFile, TemporaryDirectory
import tempfile
import duckdb
import json
import yaml
import argparse
import Bio.bgzf as bgzf
import pandas as pd
from pyfaidx import Fasta
import numpy as np
import vcf
import logging as log
import fastparquet as fp
from multiprocesspandas import applyparallel

from howard.functions.commons import *
from howard.objects.database import *
from howard.functions.databases import *
from howard.functions.utils import *


class Variants:

    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = None,
        param: dict = None,
        load: bool = False,
    ) -> None:
        """
        Initialize the object: variables, input, config, param, output,
        connexion and header, optionally loading the data.

        :param conn: the connection to the database (created if not provided)
        :param input: the input file
        :param output: the output file
        :param config: a dictionary containing the configuration
        :param param: a dictionary containing the parameters
        :param load: whether to load the input data immediately
        """

        # BUGFIX: config/param previously defaulted to a shared mutable dict
        # ({}), which leaks state between instances mutating their config or
        # param. Use None sentinels and substitute a fresh dict per instance.
        if config is None:
            config = {}
        if param is None:
            param = {}

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Load data
        if load:
            self.load_data()

    def set_input(self, input: str = None) -> None:
        """
        Define the input file and derive its name, extension and format.

        :param input: input file name, or a file-like object carrying a
            ``name`` attribute; when falsy, derived attributes are reset
        :type input: str
        """

        if input and not isinstance(input, str):
            try:
                self.input = input.name
            # BUGFIX: was a bare `except:`; only a missing `.name` attribute
            # is expected here. Error message quote also balanced.
            except AttributeError:
                log.error(f"Input file '{input}' in bad format")
                raise ValueError(f"Input file '{input}' in bad format")
        else:
            self.input = input

        # Input format
        if input:
            input_name, input_extension = os.path.splitext(self.input)
            self.input_name = input_name
            self.input_extension = input_extension
            # Format is the extension without its leading dot
            self.input_format = self.input_extension.replace(".", "")
        else:
            # Consistency with set_output: reset derived attributes instead
            # of leaving them undefined
            self.input_name = None
            self.input_extension = None
            self.input_format = None
Here's a breakdown of the parameters and their usage in the method: 87 :type input: str 88 """ 89 90 if input and not isinstance(input, str): 91 try: 92 self.input = input.name 93 except: 94 log.error(f"Input file '{input} in bad format") 95 raise ValueError(f"Input file '{input} in bad format") 96 else: 97 self.input = input 98 99 # Input format 100 if input: 101 input_name, input_extension = os.path.splitext(self.input) 102 self.input_name = input_name 103 self.input_extension = input_extension 104 self.input_format = self.input_extension.replace(".", "") 105 106 def set_config(self, config: dict) -> None: 107 """ 108 The set_config function takes a config object and assigns it as the configuration object for the 109 class. 110 111 :param config: The `config` parameter in the `set_config` function is a dictionary object that 112 contains configuration settings for the class. When you call the `set_config` function with a 113 dictionary object as the argument, it will set that dictionary as the configuration object for 114 the class 115 :type config: dict 116 """ 117 118 self.config = config 119 120 def set_param(self, param: dict) -> None: 121 """ 122 This function sets a parameter object for the class based on the input dictionary. 
123 124 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 125 as the `param` attribute of the class instance 126 :type param: dict 127 """ 128 129 self.param = param 130 131 def init_variables(self) -> None: 132 """ 133 This function initializes the variables that will be used in the rest of the class 134 """ 135 136 self.prefix = "howard" 137 self.table_variants = "variants" 138 self.dataframe = None 139 140 self.comparison_map = { 141 "gt": ">", 142 "gte": ">=", 143 "lt": "<", 144 "lte": "<=", 145 "equals": "=", 146 "contains": "SIMILAR TO", 147 } 148 149 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 150 151 self.code_type_map_to_sql = { 152 "Integer": "INTEGER", 153 "String": "VARCHAR", 154 "Float": "FLOAT", 155 "Flag": "VARCHAR", 156 } 157 158 self.index_additionnal_fields = [] 159 160 def get_indexing(self) -> bool: 161 """ 162 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 163 returns False. 164 :return: The value of the indexing parameter. 165 """ 166 167 return self.get_param().get("indexing", False) 168 169 def get_connexion_config(self) -> dict: 170 """ 171 The function `get_connexion_config` returns a dictionary containing the configuration for a 172 connection, including the number of threads and memory limit. 173 :return: a dictionary containing the configuration for the Connexion library. 
174 """ 175 176 # config 177 config = self.get_config() 178 179 # Connexion config 180 connexion_config = {} 181 threads = self.get_threads() 182 183 # Threads 184 if threads: 185 connexion_config["threads"] = threads 186 187 # Memory 188 # if config.get("memory", None): 189 # connexion_config["memory_limit"] = config.get("memory") 190 if self.get_memory(): 191 connexion_config["memory_limit"] = self.get_memory() 192 193 # Temporary directory 194 if config.get("tmp", None): 195 connexion_config["temp_directory"] = config.get("tmp") 196 197 # Access 198 if config.get("access", None): 199 access = config.get("access") 200 if access in ["RO"]: 201 access = "READ_ONLY" 202 elif access in ["RW"]: 203 access = "READ_WRITE" 204 connexion_db = self.get_connexion_db() 205 if connexion_db in ":memory:": 206 access = "READ_WRITE" 207 connexion_config["access_mode"] = access 208 209 return connexion_config 210 211 def get_duckdb_settings(self) -> dict: 212 """ 213 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 214 string. 215 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 216 """ 217 218 # config 219 config = self.get_config() 220 221 # duckdb settings 222 duckdb_settings_dict = {} 223 if config.get("duckdb_settings", None): 224 duckdb_settings = config.get("duckdb_settings") 225 duckdb_settings = full_path(duckdb_settings) 226 # duckdb setting is a file 227 if os.path.exists(duckdb_settings): 228 with open(duckdb_settings) as json_file: 229 duckdb_settings_dict = yaml.safe_load(json_file) 230 # duckdb settings is a string 231 else: 232 duckdb_settings_dict = json.loads(duckdb_settings) 233 234 return duckdb_settings_dict 235 236 def set_connexion_db(self) -> str: 237 """ 238 The function `set_connexion_db` returns the appropriate database connection string based on the 239 input format and connection type. 240 :return: the value of the variable `connexion_db`. 
241 """ 242 243 # Default connexion db 244 default_connexion_db = ":memory:" 245 246 # Find connexion db 247 if self.get_input_format() in ["db", "duckdb"]: 248 connexion_db = self.get_input() 249 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 250 connexion_db = default_connexion_db 251 elif self.get_connexion_type() in ["tmpfile"]: 252 tmp_name = tempfile.mkdtemp( 253 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 254 ) 255 connexion_db = f"{tmp_name}/tmp.db" 256 elif self.get_connexion_type() != "": 257 connexion_db = self.get_connexion_type() 258 else: 259 connexion_db = default_connexion_db 260 261 # Set connexion db 262 self.connexion_db = connexion_db 263 264 return connexion_db 265 266 def set_connexion(self, conn) -> None: 267 """ 268 The function `set_connexion` creates a connection to a database, with options for different 269 database formats and settings. 270 271 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 272 database. If a connection is not provided, a new connection to an in-memory database is created. 
273 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 274 sqlite 275 """ 276 277 # Connexion db 278 connexion_db = self.set_connexion_db() 279 280 # Connexion config 281 connexion_config = self.get_connexion_config() 282 283 # Connexion format 284 connexion_format = self.get_config().get("connexion_format", "duckdb") 285 # Set connexion format 286 self.connexion_format = connexion_format 287 288 # Connexion 289 if not conn: 290 if connexion_format in ["duckdb"]: 291 conn = duckdb.connect(connexion_db, config=connexion_config) 292 # duckDB settings 293 duckdb_settings = self.get_duckdb_settings() 294 if duckdb_settings: 295 for setting in duckdb_settings: 296 setting_value = duckdb_settings.get(setting) 297 if isinstance(setting_value, str): 298 setting_value = f"'{setting_value}'" 299 conn.execute(f"PRAGMA {setting}={setting_value};") 300 elif connexion_format in ["sqlite"]: 301 conn = sqlite3.connect(connexion_db) 302 303 # Set connexion 304 self.conn = conn 305 306 # Log 307 log.debug(f"connexion_format: {connexion_format}") 308 log.debug(f"connexion_db: {connexion_db}") 309 log.debug(f"connexion config: {connexion_config}") 310 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 311 312 def set_output(self, output: str = None) -> None: 313 """ 314 The `set_output` function in Python sets the output file based on the input or a specified key 315 in the config file, extracting the output name, extension, and format. 316 317 :param output: The `output` parameter in the `set_output` method is used to specify the name of 318 the output file. If the config file has an 'output' key, the method sets the output to the value 319 of that key. 
    def set_header(self) -> None:
        """
        It reads the header of a VCF file and stores it as a list of strings and as a VCF object.

        Sources tried, in order: an explicit "header_file" from the config,
        the input file itself (vcf/hdr formats), a sibling ".hdr" file, and
        finally the columns of the database file; falls back to a minimal
        default VCF header when everything else fails.
        """

        input_file = self.get_input()
        # Minimal fallback header
        # NOTE(review): the #CHROM line below appears space-separated here —
        # confirm the original source uses tabs between column names
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM POS ID REF ALT QUAL FILTER INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            # NOTE(review): handle opened without `with`;
                            # vcf.Writer emits the header on construction
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:
                        # NOTE(review): bare except — deliberately falls back
                        # to the default header on any failure
                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            # No input file: no header available
            self.header_list = None
            self.header_vcf = None

    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        Execute `query` and return the result as a pandas DataFrame, using
        the current connexion (duckdb or sqlite).

        :param query: the SQL query to execute
        :type query: str
        :param limit: maximum number of rows to fetch; when None, the whole
            result is returned
        :type limit: int
        :return: the query result as a pandas DataFrame
        """

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Limit in query
        if limit:
            # NOTE(review): changes a global pandas display option as a side
            # effect — confirm this is intended
            pd.set_option("display.max_rows", limit)
            if connexion_format in ["duckdb"]:
                # Fetch only the first record batch of `limit` rows
                df = (
                    self.conn.execute(query)
                    .fetch_record_batch(limit)
                    .read_next_batch()
                    .to_pandas()
                )
            elif connexion_format in ["sqlite"]:
                # First chunk of `limit` rows from the chunked reader
                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

        # Full query
        else:
            if connexion_format in ["duckdb"]:
                df = self.conn.execute(query).df()
            elif connexion_format in ["sqlite"]:
                df = pd.read_sql_query(query, self.conn)

        return df
This query will be used to fetch data from a 448 database and convert it into a pandas DataFrame 449 :type query: str 450 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 451 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 452 function will only fetch up to that number of rows from the database query result. If no limit 453 is specified, 454 :type limit: int 455 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 456 """ 457 458 # Connexion format 459 connexion_format = self.get_connexion_format() 460 461 # Limit in query 462 if limit: 463 pd.set_option("display.max_rows", limit) 464 if connexion_format in ["duckdb"]: 465 df = ( 466 self.conn.execute(query) 467 .fetch_record_batch(limit) 468 .read_next_batch() 469 .to_pandas() 470 ) 471 elif connexion_format in ["sqlite"]: 472 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 473 474 # Full query 475 else: 476 if connexion_format in ["duckdb"]: 477 df = self.conn.execute(query).df() 478 elif connexion_format in ["sqlite"]: 479 df = pd.read_sql_query(query, self.conn) 480 481 return df 482 483 def get_overview(self) -> None: 484 """ 485 The function prints the input, output, config, and dataframe of the current object 486 """ 487 table_variants_from = self.get_table_variants(clause="from") 488 sql_columns = self.get_header_columns_as_sql() 489 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 490 df = self.get_query_to_df(sql_query_export) 491 log.info( 492 "Input: " 493 + str(self.get_input()) 494 + " [" 495 + str(str(self.get_input_format())) 496 + "]" 497 ) 498 log.info( 499 "Output: " 500 + str(self.get_output()) 501 + " [" 502 + str(str(self.get_output_format())) 503 + "]" 504 ) 505 log.info("Config: ") 506 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 507 "\n" 508 ): 509 log.info("\t" + str(d)) 510 log.info("Param: ") 511 for d 
    def get_stats(self) -> dict:
        """
        The `get_stats` function calculates and returns various statistics of the current object,
        including information about the input file, variants, samples, header fields, quality, and
        SNVs/InDels.

        :return: a dictionary with "Infos", "Variants", "Samples", "Header"
            and (when a QUAL column exists) "Quality" sections
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples
        # A sample counts only when at least one genotype row matches; rows
        # are kept when FORMAT and the sample field have the same number of
        # ':'-separated entries
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                sql_query_samples = f"""
                    SELECT '{sample}' as sample,
                        REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                        count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                        concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                    )
                    GROUP BY genotype
                """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num — map the special VCF "Number" codes (A, G, R, '.')
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
            """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        # NOTE(review): in the InDel clause below, AND binds tighter than OR,
        # so the condition reads `len(REF) > 1 OR (len(ALT) > 1 AND
        # len(REF) != len(ALT))` — confirm the intended grouping
        sql_query_snv = f"""

            SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)

                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

        """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
        """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
    def stats_to_file(self, file: str = None) -> str:
        """
        Compute the statistics and write them to `file` as JSON.

        :param file: the file path where the JSON data is written
        :type file: str
        :return: the name of the file that was written to
        """

        # Get stats
        stats = self.get_stats()

        # Serializing json
        json_object = json.dumps(stats, indent=4)

        # Writing to the stats file
        with open(file, "w") as outfile:
            outfile.write(json_object)

        return file

    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate the statistics as a JSON file and a markdown file, and
        print the markdown rendering.

        :param output_file: path of the markdown output; when None, a
            temporary "stats.md" is used
        :type output_file: str
        :param json_file: path of the JSON output; when None, a temporary
            "stats.json" is used
        :type json_file: str
        :return: None
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files — default to temporary locations
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Read stats file back (YAML parser also reads JSON)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output buffers: title, index and body of the markdown document
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to render the info as a table; fall back to a
                        # plain bullet when it is not table-like
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f" - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
json.loads((infos.get(info))), orient="index" 841 ) 842 is_df = True 843 except: 844 is_df = False 845 if is_df: 846 output.append(f"### {info}") 847 info_link = "#" + info.lower().replace(" ", "-") 848 output_index.append(f" - [{info}]({info_link})") 849 output.append(f"{df.to_markdown(index=False)}") 850 else: 851 output.append(f"- {info}: {infos.get(info)}") 852 else: 853 output.append(f"NA") 854 855 # Write stats in markdown file 856 with open(output_file, "w") as fp: 857 for item in output_title: 858 fp.write("%s\n" % item) 859 for item in output_index: 860 fp.write("%s\n" % item) 861 for item in output: 862 fp.write("%s\n" % item) 863 864 # Output stats in markdown 865 print("") 866 print("\n\n".join(output_title)) 867 print("") 868 print("\n\n".join(output)) 869 print("") 870 871 return None 872 873 def get_input(self) -> str: 874 """ 875 It returns the value of the input variable. 876 :return: The input is being returned. 877 """ 878 return self.input 879 880 def get_input_format(self, input_file: str = None) -> str: 881 """ 882 This function returns the format of the input variable, either from the provided input file or 883 by prompting for input. 884 885 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 886 represents the file path of the input file. If no `input_file` is provided when calling the 887 method, it will default to `None` 888 :type input_file: str 889 :return: The format of the input variable is being returned. 890 """ 891 892 if not input_file: 893 input_file = self.get_input() 894 input_format = get_file_format(input_file) 895 return input_format 896 897 def get_input_compressed(self, input_file: str = None) -> str: 898 """ 899 The function `get_input_compressed` returns the format of the input variable after compressing 900 it. 901 902 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 903 that represents the file path of the input file. 
If no `input_file` is provided when calling the 904 method, it will default to `None` and the method will then call `self.get_input()` to 905 :type input_file: str 906 :return: The function `get_input_compressed` returns the compressed format of the input 907 variable. 908 """ 909 910 if not input_file: 911 input_file = self.get_input() 912 input_compressed = get_file_compressed(input_file) 913 return input_compressed 914 915 def get_output(self) -> str: 916 """ 917 It returns the output of the neuron. 918 :return: The output of the neural network. 919 """ 920 921 return self.output 922 923 def get_output_format(self, output_file: str = None) -> str: 924 """ 925 The function `get_output_format` returns the format of the input variable or the output file if 926 provided. 927 928 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 929 that represents the file path of the output file. If no `output_file` is provided when calling 930 the method, it will default to the output obtained from the `get_output` method of the class 931 instance. The 932 :type output_file: str 933 :return: The format of the input variable is being returned. 934 """ 935 936 if not output_file: 937 output_file = self.get_output() 938 output_format = get_file_format(output_file) 939 940 return output_format 941 942 def get_config(self) -> dict: 943 """ 944 It returns the config 945 :return: The config variable is being returned. 946 """ 947 return self.config 948 949 def get_param(self) -> dict: 950 """ 951 It returns the param 952 :return: The param variable is being returned. 953 """ 954 return self.param 955 956 def get_connexion_db(self) -> str: 957 """ 958 It returns the connexion_db attribute of the object 959 :return: The connexion_db is being returned. 960 """ 961 return self.connexion_db 962 963 def get_prefix(self) -> str: 964 """ 965 It returns the prefix of the object. 966 :return: The prefix is being returned. 
967 """ 968 return self.prefix 969 970 def get_table_variants(self, clause: str = "select") -> str: 971 """ 972 This function returns the table_variants attribute of the object 973 974 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 975 defaults to select (optional) 976 :return: The table_variants attribute of the object. 977 """ 978 979 # Access 980 access = self.get_config().get("access", None) 981 982 # Clauses "select", "where", "update" 983 if clause in ["select", "where", "update"]: 984 table_variants = self.table_variants 985 # Clause "from" 986 elif clause in ["from"]: 987 # For Read Only 988 if self.get_input_format() in ["parquet"] and access in ["RO"]: 989 input_file = self.get_input() 990 table_variants = f"'{input_file}' as variants" 991 # For Read Write 992 else: 993 table_variants = f"{self.table_variants} as variants" 994 else: 995 table_variants = self.table_variants 996 return table_variants 997 998 def get_tmp_dir(self) -> str: 999 """ 1000 The function `get_tmp_dir` returns the temporary directory path based on configuration 1001 parameters or a default path. 1002 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1003 configuration, parameters, and a default value of "/tmp". 1004 """ 1005 1006 return get_tmp( 1007 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1008 ) 1009 1010 def get_connexion_type(self) -> str: 1011 """ 1012 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1013 1014 :return: The connexion type is being returned. 1015 """ 1016 return self.get_config().get("connexion_type", "memory") 1017 1018 def get_connexion(self): 1019 """ 1020 It returns the connection object 1021 1022 :return: The connection object. 1023 """ 1024 return self.conn 1025 1026 def close_connexion(self) -> None: 1027 """ 1028 This function closes the connection to the database. 
1029 :return: The connection is being closed. 1030 """ 1031 return self.conn.close() 1032 1033 def get_header(self, type: str = "vcf"): 1034 """ 1035 This function returns the header of the VCF file as a list of strings 1036 1037 :param type: the type of header you want to get, defaults to vcf (optional) 1038 :return: The header of the vcf file. 1039 """ 1040 1041 if self.header_vcf: 1042 if type == "vcf": 1043 return self.header_vcf 1044 elif type == "list": 1045 return self.header_list 1046 else: 1047 if type == "vcf": 1048 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1049 return header 1050 elif type == "list": 1051 return vcf_required 1052 1053 def get_header_length(self, file: str = None) -> int: 1054 """ 1055 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1056 line. 1057 1058 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1059 header file. If this argument is provided, the function will read the header from the specified 1060 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1061 :type file: str 1062 :return: the length of the header list, excluding the #CHROM line. 1063 """ 1064 1065 if file: 1066 return len(self.read_vcf_header_file(file=file)) - 1 1067 elif self.get_header(type="list"): 1068 return len(self.get_header(type="list")) - 1 1069 else: 1070 return 0 1071 1072 def get_header_columns(self) -> str: 1073 """ 1074 This function returns the header list of a VCF 1075 1076 :return: The length of the header list. 1077 """ 1078 if self.get_header(): 1079 return self.get_header(type="list")[-1] 1080 else: 1081 return "" 1082 1083 def get_header_columns_as_list(self) -> list: 1084 """ 1085 This function returns the header list of a VCF 1086 1087 :return: The length of the header list. 
1088 """ 1089 if self.get_header(): 1090 return self.get_header_columns().strip().split("\t") 1091 else: 1092 return [] 1093 1094 def get_header_columns_as_sql(self) -> str: 1095 """ 1096 This function retruns header length (without #CHROM line) 1097 1098 :return: The length of the header list. 1099 """ 1100 sql_column_list = [] 1101 for col in self.get_header_columns_as_list(): 1102 sql_column_list.append(f'"{col}"') 1103 return ",".join(sql_column_list) 1104 1105 def get_header_sample_list(self) -> list: 1106 """ 1107 This function retruns header length (without #CHROM line) 1108 1109 :return: The length of the header list. 1110 """ 1111 return self.header_vcf.samples 1112 1113 def get_verbose(self) -> bool: 1114 """ 1115 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1116 exist 1117 1118 :return: The value of the key "verbose" in the config dictionary. 1119 """ 1120 return self.get_config().get("verbose", False) 1121 1122 def get_connexion_format(self) -> str: 1123 """ 1124 It returns the connexion format of the object. 1125 :return: The connexion_format is being returned. 1126 """ 1127 connexion_format = self.connexion_format 1128 if connexion_format not in ["duckdb", "sqlite"]: 1129 log.error(f"Unknown connexion format {connexion_format}") 1130 raise ValueError(f"Unknown connexion format {connexion_format}") 1131 else: 1132 return connexion_format 1133 1134 def insert_file_to_table( 1135 self, 1136 file, 1137 columns: str, 1138 header_len: int = 0, 1139 sep: str = "\t", 1140 chunksize: int = 1000000, 1141 ) -> None: 1142 """ 1143 The function reads a file in chunks and inserts each chunk into a table based on the specified 1144 database format. 1145 1146 :param file: The `file` parameter is the file that you want to load into a table. 
It should be 1147 the path to the file on your system 1148 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1149 should contain the names of the columns in the table where the data will be inserted. The column 1150 names should be separated by commas within the string. For example, if you have columns named 1151 "id", "name 1152 :type columns: str 1153 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1154 the number of lines to skip at the beginning of the file before reading the actual data. This 1155 parameter allows you to skip any header information present in the file before processing the 1156 data, defaults to 0 1157 :type header_len: int (optional) 1158 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1159 separator character that is used in the file being read. In this case, the default separator is 1160 set to `\t`, which represents a tab character. You can change this parameter to a different 1161 separator character if, defaults to \t 1162 :type sep: str (optional) 1163 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1164 when processing the file in chunks. In the provided code snippet, the default value for 1165 `chunksize` is set to 1000000. 
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Read the input file (VCF/TSV/CSV/PSV or an existing database) and load
        it into the variants table of the current connection.

        :param input_file: optional path to a new input file; when given it
            replaces the current input and the header is re-read
        :param drop_variants_table: drop the variants table before loading,
            defaults to False
        :param sample_size: number of rows sampled to infer the schema of an
            existing database input; falsy values mean "all rows" (-1),
            defaults to 20480
        """

        log.info("Loading...")

        # change input file
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Sample size: falsy means "no sampling limit"
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    # NOTE(review): unreachable inside this branch
                    # (connexion_format is already "duckdb" here)
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # Read-only access gets a view; otherwise materialize a table
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                # NOTE(review): bare except hides the real failure (SQL error,
                # missing file, ...) behind a generic "format not available"
                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure: mandatory VCF columns and their SQL types
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Strcuture with samples
            # NOTE(review): no copy is made — "structure_complete" aliases
            # "structure", so both names mutate the same dict (it still works
            # because FORMAT is inserted before the sample columns)
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize define length of file chunk load file
            chunksize = 100000

            # delimiter depends on the input format (tab by default)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): the bgzf handle shadows the handle opened by
                # "with"; only the plain handle is closed on exit, so the bgzf
                # handle appears to leak — TODO confirm and close it explicitly
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                # Only VCF inputs carry a meta-information header to skip
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()

    def get_explode_infos(self) -> bool:
        """
        Return the "explode_infos" flag from the parameters (False when the
        "explode" section or the flag itself is absent).

        :return: whether INFO fields should be exploded into columns
        """

        return self.get_param().get("explode", {}).get("explode_infos", False)
    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        Return the list of INFO fields to explode, resolving the "*" wildcard
        and regex patterns against the fields declared in the header.

        :param explode_infos_fields: the fields to explode, either a
            comma-separated string or a list; each entry may be a literal
            field name or a regex pattern, and "*" means "all header fields".
            When not provided it is read from param "explode.explode_infos_fields",
            and defaults to "*"
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: when True, fields that are not
            declared in the header are excluded from the result, defaults to
            False
        :type remove_fields_not_in_header: bool (optional)
        :return: the resolved list of field names (the "*"/".*" keyword itself
            is never returned)
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            # NOTE(review): "fields_without_all" is computed but never used below
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all fields with pattern
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # Remove fields input from search: an exact header match wins,
                # otherwise drop fields already requested explicitly
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []

    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
        """
        Return the prefix to prepend to exploded INFO columns.

        :param explode_infos_prefix: explicit prefix; when falsy it is read
            from param "explode.explode_infos_prefix" ("" by default)
        :type explode_infos_prefix: str
        :return: the prefix to use
        """

        if not explode_infos_prefix:
            explode_infos_prefix = (
                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
            )

        return explode_infos_prefix

    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a table if it does not already exist, optionally
        dropping and re-creating it when it does.

        :param table_name: name of the table to alter
        :param column_name: name of the column to add (matched
            case-insensitively against existing columns)
        :param column_type: SQL type of the new column (e.g. "INTEGER",
            "VARCHAR")
        :param default_value: optional SQL default value for the new column
        :param drop: when True and the column already exists, drop it and
            re-create it; when False an existing column is left untouched,
            defaults to False
        :type drop: bool (optional)
        :return: a dict describing the added column (table_name, column_name,
            column_type, default_value) when a brand-new column was added;
            None when the column already existed (even if it was dropped and
            re-created)
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table
        # (LIMIT 0 fetches only the schema, no rows)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # A re-created column (drop path) is deliberately NOT reported as added
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
If a default value is provided, it will be assigned to 1533 the column for any existing rows that do not have a value for that column 1534 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1535 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1536 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1537 to False 1538 :type drop: bool (optional) 1539 :return: a boolean value indicating whether the column was successfully added to the table. 1540 """ 1541 1542 # added 1543 added = False 1544 dropped = False 1545 1546 # Check if the column already exists in the table 1547 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1548 columns = self.get_query_to_df(query).columns.tolist() 1549 if column_name.upper() in [c.upper() for c in columns]: 1550 log.debug( 1551 f"The {column_name} column already exists in the {table_name} table" 1552 ) 1553 if drop: 1554 self.drop_column(table_name=table_name, column_name=column_name) 1555 dropped = True 1556 else: 1557 return None 1558 else: 1559 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1560 1561 # Add column in table 1562 add_column_query = ( 1563 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1564 ) 1565 if default_value is not None: 1566 add_column_query += f" DEFAULT {default_value}" 1567 self.execute_query(add_column_query) 1568 added = not dropped 1569 log.debug( 1570 f"The {column_name} column was successfully added to the {table_name} table" 1571 ) 1572 1573 if added: 1574 added_column = { 1575 "table_name": table_name, 1576 "column_name": column_name, 1577 "column_type": column_type, 1578 "default_value": default_value, 1579 } 1580 else: 1581 added_column = None 1582 1583 return added_column 1584 1585 def drop_column( 1586 self, column: dict = None, table_name: str = None, column_name: str = None 1587 ) -> bool: 1588 """ 1589 The 
`drop_column` function drops a specified column from a given table in a database and returns 1590 True if the column was successfully dropped, and False if the column does not exist in the 1591 table. 1592 1593 :param column: The `column` parameter is a dictionary that contains information about the column 1594 you want to drop. It has two keys: 1595 :type column: dict 1596 :param table_name: The `table_name` parameter is the name of the table from which you want to 1597 drop a column 1598 :type table_name: str 1599 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1600 from the table 1601 :type column_name: str 1602 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1603 and False if the column does not exist in the table. 1604 """ 1605 1606 # Find column infos 1607 if column: 1608 if isinstance(column, dict): 1609 table_name = column.get("table_name", None) 1610 column_name = column.get("column_name", None) 1611 elif isinstance(column, str): 1612 table_name = self.get_table_variants() 1613 column_name = column 1614 else: 1615 table_name = None 1616 column_name = None 1617 1618 if not table_name and not column_name: 1619 return False 1620 1621 # Removed 1622 removed = False 1623 1624 # Check if the column already exists in the table 1625 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1626 columns = self.get_query_to_df(query).columns.tolist() 1627 if column_name in columns: 1628 log.debug(f"The {column_name} column exists in the {table_name} table") 1629 else: 1630 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1631 return False 1632 1633 # Add column in table # ALTER TABLE integers DROP k 1634 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1635 self.execute_query(add_column_query) 1636 removed = True 1637 log.debug( 1638 f"The {column_name} column was successfully dropped to the {table_name} table" 1639 ) 1640 
1641 return removed 1642 1643 def explode_infos( 1644 self, 1645 prefix: str = None, 1646 create_index: bool = False, 1647 fields: list = None, 1648 force: bool = False, 1649 proccess_all_fields_together: bool = False, 1650 table: str = None, 1651 ) -> list: 1652 """ 1653 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1654 individual columns, returning a list of added columns. 1655 1656 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1657 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1658 `self.get_explode_infos_prefix()` as the prefix 1659 :type prefix: str 1660 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1661 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1662 `False`, indexes will not be created. The default value is `False`, defaults to False 1663 :type create_index: bool (optional) 1664 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1665 that you want to explode into individual columns. If this parameter is not provided, all INFO 1666 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1667 a list to the ` 1668 :type fields: list 1669 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1670 determines whether to drop and recreate a column if it already exists in the table. If `force` 1671 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1672 defaults to False 1673 :type force: bool (optional) 1674 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1675 flag that determines whether to process all the INFO fields together or individually. If set to 1676 `True`, all the INFO fields will be processed together. 
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into individual table columns and populate
        them from the INFO column.

        :param prefix: prefix for the exploded columns; when None/True/non-str,
            falls back to get_explode_infos_prefix() then to "INFO/"
        :type prefix: str
        :param create_index: create indexes after exploding, defaults to False
        :type create_index: bool (optional)
        :param fields: INFO fields (or patterns) to explode; resolved through
            get_explode_infos_fields()
        :type fields: list
        :param force: drop and re-create a column if it already exists, and
            re-populate it, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: run a single UPDATE for all fields
            instead of one UPDATE per field, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: target table; defaults to the variants table
        :type table: str
        :return: the list of added columns (dicts from add_column)
        """

        # drop indexes
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        # No schema changes allowed in read-only mode
        if access not in ["RO"]:

            # prefix
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos
            # NOTE(review): bare except silently degrades to "no extra infos"
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                # Only explode fields known from the header, the request,
                # or the extra infos
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/number from the header; unknown fields default to
                    # a single String value
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # Re-populate when the column is new, or when forced
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array
                        if connexion_format in ["duckdb"]:
                            # DuckDB: regexp-extract "FIELD=value" from INFO;
                            # empty values and "." become NULL
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                            END
                            """
                        elif connexion_format in ["sqlite"]:
                            # SQLite has no regexp by default: emulate the
                            # extraction with nested instr()/substr()
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                            END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes
                # NOTE(review): bare except degrades to a single unfiltered pass
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

    def create_indexes(self) -> None:
        """
        Create indexes on the variants table after insertion (no-op when
        indexing is disabled or access is read-only).
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        if self.get_indexing() and access not in ["RO"]:
            # Create index
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
            self.conn.execute(sql_create_table_index)
            # One index per exploded INFO field
            for field in self.index_additionnal_fields:
                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
                self.conn.execute(sql_create_table_index)
self.get_connexion_format() 1900 1901 if access not in ["RO"]: 1902 if connexion_format in ["duckdb"]: 1903 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 1904 elif connexion_format in ["sqlite"]: 1905 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 1906 1907 list_indexes = self.conn.execute(sql_list_indexes) 1908 index_names = [row[0] for row in list_indexes.fetchall()] 1909 for index in index_names: 1910 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 1911 self.conn.execute(sql_drop_table_index) 1912 1913 def read_vcf_header(self, f) -> list: 1914 """ 1915 It reads the header of a VCF file and returns a list of the header lines 1916 1917 :param f: the file object 1918 :return: The header lines of the VCF file. 1919 """ 1920 1921 header_list = [] 1922 for line in f: 1923 header_list.append(line) 1924 if line.startswith("#CHROM"): 1925 break 1926 return header_list 1927 1928 def read_vcf_header_file(self, file: str = None) -> list: 1929 """ 1930 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 1931 uncompressed files. 1932 1933 :param file: The `file` parameter is a string that represents the path to the VCF header file 1934 that you want to read. It is an optional parameter, so if you don't provide a value, it will 1935 default to `None` 1936 :type file: str 1937 :return: The function `read_vcf_header_file` returns a list. 1938 """ 1939 1940 if self.get_input_compressed(input_file=file): 1941 with bgzf.open(file, "rt") as f: 1942 return self.read_vcf_header(f=f) 1943 else: 1944 with open(file, "rt") as f: 1945 return self.read_vcf_header(f=f) 1946 1947 def execute_query(self, query: str): 1948 """ 1949 It takes a query as an argument, executes it, and returns the results 1950 1951 :param query: The query to be executed 1952 :return: The result of the query is being returned. 
1953 """ 1954 if query: 1955 return self.conn.execute(query) # .fetchall() 1956 else: 1957 return None 1958 1959 def export_output( 1960 self, 1961 output_file: str | None = None, 1962 output_header: str | None = None, 1963 export_header: bool = True, 1964 query: str | None = None, 1965 parquet_partitions: list | None = None, 1966 chunk_size: int | None = None, 1967 threads: int | None = None, 1968 sort: bool = False, 1969 index: bool = False, 1970 order_by: str | None = None, 1971 ) -> bool: 1972 """ 1973 The `export_output` function exports data from a VCF file to a specified output file in various 1974 formats, including VCF, CSV, TSV, PSV, and Parquet. 1975 1976 :param output_file: The `output_file` parameter is a string that specifies the name of the 1977 output file to be generated by the function. This is where the exported data will be saved 1978 :type output_file: str 1979 :param output_header: The `output_header` parameter is a string that specifies the name of the 1980 file where the header of the VCF file will be exported. If this parameter is not provided, the 1981 header will be exported to a file with the same name as the `output_file` parameter, but with 1982 the extension " 1983 :type output_header: str 1984 :param export_header: The `export_header` parameter is a boolean flag that determines whether 1985 the header of a VCF file should be exported to a separate file or not. If `export_header` is 1986 True, the header will be exported to a file. If `export_header` is False, the header will not 1987 be, defaults to True, if output format is not VCF 1988 :type export_header: bool (optional) 1989 :param query: The `query` parameter is an optional SQL query that can be used to filter and 1990 select specific data from the VCF file before exporting it. 
If provided, only the data that 1991 matches the query will be exported 1992 :type query: str 1993 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 1994 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 1995 organize data in a hierarchical directory structure based on the values of one or more columns. 1996 This can improve query performance when working with large datasets 1997 :type parquet_partitions: list 1998 :param chunk_size: The `chunk_size` parameter specifies the number of 1999 records in batch when exporting data in Parquet format. This parameter is used for 2000 partitioning the Parquet file into multiple files. 2001 :type chunk_size: int 2002 :param threads: The `threads` parameter is an optional parameter that specifies the number of 2003 threads to be used during the export process. It determines the level of parallelism and can 2004 improve the performance of the export operation. If not provided, the function will use the 2005 default number of threads 2006 :type threads: int 2007 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 2008 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 2009 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 2010 False 2011 :type sort: bool (optional) 2012 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2013 created on the output file. If `index` is True, an index will be created. If `index` is False, 2014 no index will be created. The default value is False, defaults to False 2015 :type index: bool (optional) 2016 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2017 sorting the output file. This parameter is only applicable when exporting data in VCF format 2018 :type order_by: str 2019 :return: a boolean value. 
It checks if the output file exists and returns True if it does, or 2020 None if it doesn't. 2021 """ 2022 2023 # Log 2024 log.info("Exporting...") 2025 2026 # Full path 2027 output_file = full_path(output_file) 2028 output_header = full_path(output_header) 2029 2030 # Config 2031 config = self.get_config() 2032 2033 # Param 2034 param = self.get_param() 2035 2036 # Tmp files to remove 2037 tmp_to_remove = [] 2038 2039 # If no output, get it 2040 if not output_file: 2041 output_file = self.get_output() 2042 2043 # If not threads 2044 if not threads: 2045 threads = self.get_threads() 2046 2047 # Auto header name with extension 2048 if export_header or output_header: 2049 if not output_header: 2050 output_header = f"{output_file}.hdr" 2051 # Export header 2052 self.export_header(output_file=output_file) 2053 2054 # Switch off export header if VCF output 2055 output_file_type = get_file_format(output_file) 2056 if output_file_type in ["vcf"]: 2057 export_header = False 2058 tmp_to_remove.append(output_header) 2059 2060 # Chunk size 2061 if not chunk_size: 2062 chunk_size = config.get("chunk_size", None) 2063 2064 # Parquet partition 2065 if not parquet_partitions: 2066 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2067 if parquet_partitions and isinstance(parquet_partitions, str): 2068 parquet_partitions = parquet_partitions.split(",") 2069 2070 # Order by 2071 if not order_by: 2072 order_by = param.get("export", {}).get("order_by", "") 2073 2074 # Header in output 2075 header_in_output = param.get("export", {}).get("include_header", False) 2076 2077 # Database 2078 database_source = self.get_connexion() 2079 2080 # Connexion format 2081 connexion_format = self.get_connexion_format() 2082 2083 # Explode infos 2084 if self.get_explode_infos(): 2085 self.explode_infos( 2086 prefix=self.get_explode_infos_prefix(), 2087 fields=self.get_explode_infos_fields(), 2088 force=False, 2089 ) 2090 2091 # if connexion_format in ["sqlite"] or query: 
2092 if connexion_format in ["sqlite"]: 2093 2094 # Export in Parquet 2095 random_tmp = "".join( 2096 random.choice(string.ascii_lowercase) for i in range(10) 2097 ) 2098 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2099 tmp_to_remove.append(database_source) 2100 2101 # Table Variants 2102 table_variants = self.get_table_variants() 2103 2104 # Create export query 2105 sql_query_export_subquery = f""" 2106 SELECT * FROM {table_variants} 2107 """ 2108 2109 # Write source file 2110 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2111 2112 # Create database 2113 database = Database( 2114 database=database_source, 2115 table="variants", 2116 header_file=output_header, 2117 conn_config=self.get_connexion_config(), 2118 ) 2119 2120 # Existing colomns header 2121 # existing_columns_header = database.get_header_file_columns(output_header) 2122 existing_columns_header = database.get_header_columns_from_database() 2123 2124 # Export file 2125 database.export( 2126 output_database=output_file, 2127 output_header=output_header, 2128 existing_columns_header=existing_columns_header, 2129 parquet_partitions=parquet_partitions, 2130 chunk_size=chunk_size, 2131 threads=threads, 2132 sort=sort, 2133 index=index, 2134 header_in_output=header_in_output, 2135 order_by=order_by, 2136 query=query, 2137 export_header=export_header, 2138 ) 2139 2140 # Remove 2141 remove_if_exists(tmp_to_remove) 2142 2143 return (os.path.exists(output_file) or None) and ( 2144 os.path.exists(output_file) or None 2145 ) 2146 2147 def get_extra_infos(self, table: str = None) -> list: 2148 """ 2149 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2150 in the header. 2151 2152 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2153 name of the table from which you want to retrieve the extra columns that are not present in the 2154 header. 
If the `table` parameter is not provided when calling the function, it will default to 2155 using the variants 2156 :type table: str 2157 :return: A list of columns that are in the specified table but not in the header of the table. 2158 """ 2159 2160 header_columns = [] 2161 2162 if not table: 2163 table = self.get_table_variants(clause="from") 2164 header_columns = self.get_header_columns() 2165 2166 # Check all columns in the database 2167 query = f""" SELECT * FROM {table} LIMIT 1 """ 2168 log.debug(f"query {query}") 2169 table_columns = self.get_query_to_df(query).columns.tolist() 2170 extra_columns = [] 2171 2172 # Construct extra infos (not in header) 2173 for column in table_columns: 2174 if column not in header_columns: 2175 extra_columns.append(column) 2176 2177 return extra_columns 2178 2179 def get_extra_infos_sql(self, table: str = None) -> str: 2180 """ 2181 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2182 by double quotes 2183 2184 :param table: The name of the table to get the extra infos from. If None, the default table is 2185 used 2186 :type table: str 2187 :return: A string of the extra infos 2188 """ 2189 2190 return ", ".join( 2191 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2192 ) 2193 2194 def export_header( 2195 self, 2196 header_name: str = None, 2197 output_file: str = None, 2198 output_file_ext: str = ".hdr", 2199 clean_header: bool = True, 2200 remove_chrom_line: bool = False, 2201 ) -> str: 2202 """ 2203 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2204 specified options, and writes it to a new file. 2205 2206 :param header_name: The `header_name` parameter is the name of the header file to be created. 
If 2207 this parameter is not specified, the header will be written to the output file 2208 :type header_name: str 2209 :param output_file: The `output_file` parameter in the `export_header` function is used to 2210 specify the name of the output file where the header will be written. If this parameter is not 2211 provided, the header will be written to a temporary file 2212 :type output_file: str 2213 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2214 string that represents the extension of the output header file. By default, it is set to ".hdr" 2215 if not specified by the user. This extension will be appended to the `output_file` name to 2216 create the final, defaults to .hdr 2217 :type output_file_ext: str (optional) 2218 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2219 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2220 `True`, the function will clean the header by modifying certain lines based on a specific 2221 pattern. If `clean_header`, defaults to True 2222 :type clean_header: bool (optional) 2223 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2224 boolean flag that determines whether the #CHROM line should be removed from the header before 2225 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2226 defaults to False 2227 :type remove_chrom_line: bool (optional) 2228 :return: The function `export_header` returns the name of the temporary header file that is 2229 created. 
2230 """ 2231 2232 if not header_name and not output_file: 2233 output_file = self.get_output() 2234 2235 if self.get_header(): 2236 2237 # Get header object 2238 header_obj = self.get_header() 2239 2240 # Create database 2241 db_for_header = Database(database=self.get_input()) 2242 2243 # Get real columns in the file 2244 db_header_columns = db_for_header.get_columns() 2245 2246 with tempfile.TemporaryDirectory() as tmpdir: 2247 2248 # Write header file 2249 header_file_tmp = os.path.join(tmpdir, "header") 2250 f = open(header_file_tmp, "w") 2251 vcf.Writer(f, header_obj) 2252 f.close() 2253 2254 # Replace #CHROM line with rel columns 2255 header_list = db_for_header.read_header_file( 2256 header_file=header_file_tmp 2257 ) 2258 header_list[-1] = "\t".join(db_header_columns) 2259 2260 # Remove CHROM line 2261 if remove_chrom_line: 2262 header_list.pop() 2263 2264 # Clean header 2265 if clean_header: 2266 header_list_clean = [] 2267 for head in header_list: 2268 # Clean head for malformed header 2269 head_clean = head 2270 head_clean = re.subn( 2271 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2272 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2273 head_clean, 2274 2, 2275 )[0] 2276 # Write header 2277 header_list_clean.append(head_clean) 2278 header_list = header_list_clean 2279 2280 tmp_header_name = output_file + output_file_ext 2281 2282 f = open(tmp_header_name, "w") 2283 for line in header_list: 2284 f.write(line) 2285 f.close() 2286 2287 return tmp_header_name 2288 2289 def export_variant_vcf( 2290 self, 2291 vcf_file, 2292 remove_info: bool = False, 2293 add_samples: bool = True, 2294 list_samples: list = [], 2295 where_clause: str = "", 2296 index: bool = False, 2297 threads: int | None = None, 2298 ) -> bool | None: 2299 """ 2300 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2301 remove INFO field, add samples, and control compression and indexing. 
2302 2303 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2304 written to. It is the output file that will contain the filtered VCF data based on the specified 2305 parameters 2306 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2307 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2308 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2309 in, defaults to False 2310 :type remove_info: bool (optional) 2311 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2312 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2313 If set to False, the samples will be removed. The default value is True, defaults to True 2314 :type add_samples: bool (optional) 2315 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2316 in the output VCF file. By default, all samples will be included. If you provide a list of 2317 samples, only those samples will be included in the output file 2318 :type list_samples: list 2319 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2320 determines whether or not to create an index for the output VCF file. If `index` is set to 2321 `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False 2322 :type index: bool (optional) 2323 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2324 number of threads to use for exporting the VCF file. It determines how many parallel threads 2325 will be used during the export process. More threads can potentially speed up the export process 2326 by utilizing multiple cores of the processor. 
If 2327 :type threads: int | None 2328 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2329 method with various parameters including the output file, query, threads, sort flag, and index 2330 flag. The `export_output` method is responsible for exporting the VCF data based on the 2331 specified parameters and configurations provided in the `export_variant_vcf` function. 2332 """ 2333 2334 # Config 2335 config = self.get_config() 2336 2337 # Extract VCF 2338 log.debug("Export VCF...") 2339 2340 # Table variants 2341 table_variants = self.get_table_variants() 2342 2343 # Threads 2344 if not threads: 2345 threads = self.get_threads() 2346 2347 # Info fields 2348 if remove_info: 2349 if not isinstance(remove_info, str): 2350 remove_info = "." 2351 info_field = f"""'{remove_info}' as INFO""" 2352 else: 2353 info_field = "INFO" 2354 2355 # Samples fields 2356 if add_samples: 2357 if not list_samples: 2358 list_samples = self.get_header_sample_list() 2359 if list_samples: 2360 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2361 else: 2362 samples_fields = "" 2363 log.debug(f"samples_fields: {samples_fields}") 2364 else: 2365 samples_fields = "" 2366 2367 # Where clause 2368 if where_clause is None: 2369 where_clause = "" 2370 2371 # Variants 2372 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2373 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2374 log.debug(f"sql_query_select={sql_query_select}") 2375 2376 return self.export_output( 2377 output_file=vcf_file, 2378 output_header=None, 2379 export_header=True, 2380 query=sql_query_select, 2381 parquet_partitions=None, 2382 chunk_size=config.get("chunk_size", None), 2383 threads=threads, 2384 sort=True, 2385 index=index, 2386 order_by=None, 2387 ) 2388 2389 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2390 """ 2391 It takes a list of commands and runs 
them in parallel using the number of threads specified 2392 2393 :param commands: A list of commands to run 2394 :param threads: The number of threads to use, defaults to 1 (optional) 2395 """ 2396 2397 run_parallel_commands(commands, threads) 2398 2399 def get_threads(self, default: int = 1) -> int: 2400 """ 2401 This function returns the number of threads to use for a job, with a default value of 1 if not 2402 specified. 2403 2404 :param default: The `default` parameter in the `get_threads` method is used to specify the 2405 default number of threads to use if no specific value is provided. If no value is provided for 2406 the `threads` parameter in the configuration or input parameters, the `default` value will be 2407 used, defaults to 1 2408 :type default: int (optional) 2409 :return: the number of threads to use for the current job. 2410 """ 2411 2412 # Config 2413 config = self.get_config() 2414 2415 # Param 2416 param = self.get_param() 2417 2418 # Input threads 2419 input_thread = param.get("threads", config.get("threads", None)) 2420 2421 # Check threads 2422 if not input_thread: 2423 threads = default 2424 elif int(input_thread) <= 0: 2425 threads = os.cpu_count() 2426 else: 2427 threads = int(input_thread) 2428 return threads 2429 2430 def get_memory(self, default: str = None) -> str: 2431 """ 2432 This function retrieves the memory value from parameters or configuration with a default value 2433 if not found. 2434 2435 :param default: The `get_memory` function takes in a default value as a string parameter. This 2436 default value is used as a fallback in case the `memory` parameter is not provided in the 2437 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2438 the function 2439 :type default: str 2440 :return: The `get_memory` function returns a string value representing the memory parameter. If 2441 the `input_memory` is provided in the parameters, it will return that value. 
    def update_from_vcf(self, vcf_file: str) -> None:
        """
        Update the variants table INFO column from a VCF file, dispatching on
        the connexion format (duckdb or sqlite).

        :param vcf_file: the path to the VCF file
        """

        connexion_format = self.get_connexion_format()

        if connexion_format in ["duckdb"]:
            self.update_from_vcf_duckdb(vcf_file)
        elif connexion_format in ["sqlite"]:
            self.update_from_vcf_sqlite(vcf_file)

    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (duckdb).

        The VCF body is loaded into a pandas DataFrame named ``vcf_df``;
        duckdb can then reference that local DataFrame by name in the UPDATE
        query (replacement scan). Rows are matched on #CHROM/POS/REF/ALT and
        the two INFO strings are concatenated with a ';' separator when both
        are set.

        :param vcf_file: the path to the VCF file
        """

        # Variants table
        table_variants = self.get_table_variants()

        # Load the VCF body (header lines skipped) into a DataFrame
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: 'vcf_df' is referenced by name inside the SQL below (duckdb
        # replacement scan of local variables) — do not rename it
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    (
                    SELECT
                        concat(
                            CASE
                                WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                THEN ';'
                                ELSE ''
                            END
                            ,
                            CASE
                                WHEN table_parquet.INFO NOT IN ('','.')
                                THEN table_parquet.INFO
                                ELSE ''
                            END
                        )
                    FROM vcf_df as table_parquet
                    WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                        AND table_parquet.INFO NOT IN ('','.')
                    )
            )
            ;
            """
        self.conn.execute(sql_query_update)

    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (sqlite).

        The VCF body is loaded into a temporary table, the variants INFO
        column is updated by concatenating (with the '||' operator) the
        matching VCF INFO values, and the temporary table is dropped.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same structure as 'variants'
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body (comment lines skipped) into the temporary table
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator (sqlite has no concat() function)
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                            AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
                        CASE
                            WHEN table_vcf.INFO NOT IN ('','.')
                            THEN table_vcf.INFO
                            ELSE ''
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                        AND table_vcf.\"POS\" = table_variants.\"POS\"
                        AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                        AND table_vcf.\"REF\" = table_variants.\"REF\"
                    )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

    def drop_variants_table(self) -> None:
        """
        Drop the variants table if it exists.
        """

        table_variants = self.get_table_variants()
        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
        self.conn.execute(sql_table_variants)
'"{prefix}SVTYPE"') 2648 """ 2649 ) 2650 2651 # Remove added columns 2652 for added_column in added_columns: 2653 self.drop_column(column=added_column) 2654 2655 # return variant_id column name 2656 return variant_id_column 2657 2658 def get_variant_id_column( 2659 self, variant_id_column: str = "variant_id", force: bool = None 2660 ) -> str: 2661 """ 2662 This function returns the variant_id column name 2663 2664 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2665 defaults to variant_id 2666 :type variant_id_column: str (optional) 2667 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2668 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2669 if it is not already set, or if it is set 2670 :type force: bool 2671 :return: The variant_id column name. 2672 """ 2673 2674 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2675 2676 ### 2677 # Annotation 2678 ### 2679 2680 def scan_databases( 2681 self, 2682 database_formats: list = ["parquet"], 2683 database_releases: list = ["current"], 2684 ) -> dict: 2685 """ 2686 The function `scan_databases` scans for available databases based on specified formats and 2687 releases. 2688 2689 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2690 of the databases to be scanned. In this case, the accepted format is "parquet" 2691 :type database_formats: list ["parquet"] 2692 :param database_releases: The `database_releases` parameter is a list that specifies the 2693 releases of the databases to be scanned. 
In the provided function, the default value for 2694 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2695 databases that are in the "current" 2696 :type database_releases: list 2697 :return: The function `scan_databases` returns a dictionary containing information about 2698 databases that match the specified formats and releases. 2699 """ 2700 2701 # Config 2702 config = self.get_config() 2703 2704 # Param 2705 param = self.get_param() 2706 2707 # Param - Assembly 2708 assembly = param.get("assembly", config.get("assembly", None)) 2709 if not assembly: 2710 assembly = DEFAULT_ASSEMBLY 2711 log.warning(f"Default assembly '{assembly}'") 2712 2713 # Scan for availabled databases 2714 log.info( 2715 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2716 ) 2717 databases_infos_dict = databases_infos( 2718 database_folder_releases=database_releases, 2719 database_formats=database_formats, 2720 assembly=assembly, 2721 config=config, 2722 ) 2723 log.info( 2724 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2725 ) 2726 2727 return databases_infos_dict 2728 2729 def annotation(self) -> None: 2730 """ 2731 It annotates the VCF file with the annotations specified in the config file. 
2732 """ 2733 2734 # Config 2735 config = self.get_config() 2736 2737 # Param 2738 param = self.get_param() 2739 2740 # Param - Assembly 2741 assembly = param.get("assembly", config.get("assembly", None)) 2742 if not assembly: 2743 assembly = DEFAULT_ASSEMBLY 2744 log.warning(f"Default assembly '{assembly}'") 2745 2746 # annotations databases folders 2747 annotations_databases = set( 2748 config.get("folders", {}) 2749 .get("databases", {}) 2750 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2751 + config.get("folders", {}) 2752 .get("databases", {}) 2753 .get("parquet", ["~/howard/databases/parquet/current"]) 2754 + config.get("folders", {}) 2755 .get("databases", {}) 2756 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2757 ) 2758 2759 # Get param annotations 2760 if param.get("annotations", None) and isinstance( 2761 param.get("annotations", None), str 2762 ): 2763 log.debug(param.get("annotations", None)) 2764 param_annotation_list = param.get("annotations").split(",") 2765 else: 2766 param_annotation_list = [] 2767 2768 # Each tools param 2769 if param.get("annotation_parquet", None) != None: 2770 log.debug( 2771 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2772 ) 2773 if isinstance(param.get("annotation_parquet", None), list): 2774 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2775 else: 2776 param_annotation_list.append(param.get("annotation_parquet")) 2777 if param.get("annotation_snpsift", None) != None: 2778 if isinstance(param.get("annotation_snpsift", None), list): 2779 param_annotation_list.append( 2780 "snpsift:" 2781 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2782 ) 2783 else: 2784 param_annotation_list.append( 2785 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2786 ) 2787 if param.get("annotation_snpeff", None) != None: 2788 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2789 if param.get("annotation_bcftools", 
None) != None: 2790 if isinstance(param.get("annotation_bcftools", None), list): 2791 param_annotation_list.append( 2792 "bcftools:" 2793 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2794 ) 2795 else: 2796 param_annotation_list.append( 2797 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2798 ) 2799 if param.get("annotation_annovar", None) != None: 2800 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2801 if param.get("annotation_exomiser", None) != None: 2802 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2803 if param.get("annotation_splice", None) != None: 2804 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2805 2806 # Merge param annotations list 2807 param["annotations"] = ",".join(param_annotation_list) 2808 2809 # debug 2810 log.debug(f"param_annotations={param['annotations']}") 2811 2812 if param.get("annotations"): 2813 2814 # Log 2815 # log.info("Annotations - Check annotation parameters") 2816 2817 if not "annotation" in param: 2818 param["annotation"] = {} 2819 2820 # List of annotations parameters 2821 annotations_list_input = {} 2822 if isinstance(param.get("annotations", None), str): 2823 annotation_file_list = [ 2824 value for value in param.get("annotations", "").split(",") 2825 ] 2826 for annotation_file in annotation_file_list: 2827 annotations_list_input[annotation_file] = {"INFO": None} 2828 else: 2829 annotations_list_input = param.get("annotations", {}) 2830 2831 log.info(f"Quick Annotations:") 2832 for annotation_key in list(annotations_list_input.keys()): 2833 log.info(f" {annotation_key}") 2834 2835 # List of annotations and associated fields 2836 annotations_list = {} 2837 2838 for annotation_file in annotations_list_input: 2839 2840 # Explode annotations if ALL 2841 if ( 2842 annotation_file.upper() == "ALL" 2843 or annotation_file.upper().startswith("ALL:") 2844 ): 2845 2846 # check ALL parameters (formats, releases) 
2847 annotation_file_split = annotation_file.split(":") 2848 database_formats = "parquet" 2849 database_releases = "current" 2850 for annotation_file_option in annotation_file_split[1:]: 2851 database_all_options_split = annotation_file_option.split("=") 2852 if database_all_options_split[0] == "format": 2853 database_formats = database_all_options_split[1].split("+") 2854 if database_all_options_split[0] == "release": 2855 database_releases = database_all_options_split[1].split("+") 2856 2857 # Scan for availabled databases 2858 databases_infos_dict = self.scan_databases( 2859 database_formats=database_formats, 2860 database_releases=database_releases, 2861 ) 2862 2863 # Add found databases in annotation parameters 2864 for database_infos in databases_infos_dict.keys(): 2865 annotations_list[database_infos] = {"INFO": None} 2866 2867 else: 2868 annotations_list[annotation_file] = annotations_list_input[ 2869 annotation_file 2870 ] 2871 2872 # Check each databases 2873 if len(annotations_list): 2874 2875 log.info( 2876 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
2877 ) 2878 2879 for annotation_file in annotations_list: 2880 2881 # Init 2882 annotations = annotations_list.get(annotation_file, None) 2883 2884 # Annotation snpEff 2885 if annotation_file.startswith("snpeff"): 2886 2887 log.debug(f"Quick Annotation snpEff") 2888 2889 if "snpeff" not in param["annotation"]: 2890 param["annotation"]["snpeff"] = {} 2891 2892 if "options" not in param["annotation"]["snpeff"]: 2893 param["annotation"]["snpeff"]["options"] = "" 2894 2895 # snpEff options in annotations 2896 param["annotation"]["snpeff"]["options"] = "".join( 2897 annotation_file.split(":")[1:] 2898 ) 2899 2900 # Annotation Annovar 2901 elif annotation_file.startswith("annovar"): 2902 2903 log.debug(f"Quick Annotation Annovar") 2904 2905 if "annovar" not in param["annotation"]: 2906 param["annotation"]["annovar"] = {} 2907 2908 if "annotations" not in param["annotation"]["annovar"]: 2909 param["annotation"]["annovar"]["annotations"] = {} 2910 2911 # Options 2912 annotation_file_split = annotation_file.split(":") 2913 for annotation_file_annotation in annotation_file_split[1:]: 2914 if annotation_file_annotation: 2915 param["annotation"]["annovar"]["annotations"][ 2916 annotation_file_annotation 2917 ] = annotations 2918 2919 # Annotation Exomiser 2920 elif annotation_file.startswith("exomiser"): 2921 2922 log.debug(f"Quick Annotation Exomiser") 2923 2924 param["annotation"]["exomiser"] = params_string_to_dict( 2925 annotation_file 2926 ) 2927 2928 # Annotation Splice 2929 elif annotation_file.startswith("splice"): 2930 2931 log.debug(f"Quick Annotation Splice") 2932 2933 param["annotation"]["splice"] = params_string_to_dict( 2934 annotation_file 2935 ) 2936 2937 # Annotation Parquet or BCFTOOLS 2938 else: 2939 2940 # Tools detection 2941 if annotation_file.startswith("bcftools:"): 2942 annotation_tool_initial = "bcftools" 2943 annotation_file = ":".join(annotation_file.split(":")[1:]) 2944 elif annotation_file.startswith("snpsift:"): 2945 annotation_tool_initial = 
"snpsift" 2946 annotation_file = ":".join(annotation_file.split(":")[1:]) 2947 else: 2948 annotation_tool_initial = None 2949 2950 # list of files 2951 annotation_file_list = annotation_file.replace("+", ":").split( 2952 ":" 2953 ) 2954 2955 for annotation_file in annotation_file_list: 2956 2957 if annotation_file: 2958 2959 # Annotation tool initial 2960 annotation_tool = annotation_tool_initial 2961 2962 # Find file 2963 annotation_file_found = None 2964 2965 # Expand user 2966 annotation_file = full_path(annotation_file) 2967 2968 if os.path.exists(annotation_file): 2969 annotation_file_found = annotation_file 2970 2971 else: 2972 # Find within assembly folders 2973 for annotations_database in annotations_databases: 2974 found_files = find_all( 2975 annotation_file, 2976 os.path.join( 2977 annotations_database, assembly 2978 ), 2979 ) 2980 if len(found_files) > 0: 2981 annotation_file_found = found_files[0] 2982 break 2983 if not annotation_file_found and not assembly: 2984 # Find within folders 2985 for ( 2986 annotations_database 2987 ) in annotations_databases: 2988 found_files = find_all( 2989 annotation_file, annotations_database 2990 ) 2991 if len(found_files) > 0: 2992 annotation_file_found = found_files[0] 2993 break 2994 log.debug( 2995 f"for {annotation_file} annotation_file_found={annotation_file_found}" 2996 ) 2997 2998 # Full path 2999 annotation_file_found = full_path(annotation_file_found) 3000 3001 if annotation_file_found: 3002 3003 database = Database(database=annotation_file_found) 3004 quick_annotation_format = database.get_format() 3005 quick_annotation_is_compressed = ( 3006 database.is_compressed() 3007 ) 3008 quick_annotation_is_indexed = os.path.exists( 3009 f"{annotation_file_found}.tbi" 3010 ) 3011 bcftools_preference = False 3012 3013 # Check Annotation Tool 3014 if not annotation_tool: 3015 if ( 3016 bcftools_preference 3017 and quick_annotation_format 3018 in ["vcf", "bed"] 3019 and quick_annotation_is_compressed 3020 and 
quick_annotation_is_indexed 3021 ): 3022 annotation_tool = "bcftools" 3023 elif quick_annotation_format in [ 3024 "vcf", 3025 "bed", 3026 "tsv", 3027 "tsv", 3028 "csv", 3029 "json", 3030 "tbl", 3031 "parquet", 3032 "duckdb", 3033 ]: 3034 annotation_tool = "parquet" 3035 else: 3036 log.error( 3037 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3038 ) 3039 raise ValueError( 3040 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3041 ) 3042 3043 log.debug( 3044 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3045 ) 3046 3047 # Annotation Tool dispatch 3048 if annotation_tool: 3049 if annotation_tool not in param["annotation"]: 3050 param["annotation"][annotation_tool] = {} 3051 if ( 3052 "annotations" 3053 not in param["annotation"][annotation_tool] 3054 ): 3055 param["annotation"][annotation_tool][ 3056 "annotations" 3057 ] = {} 3058 param["annotation"][annotation_tool][ 3059 "annotations" 3060 ][annotation_file_found] = annotations 3061 3062 else: 3063 log.error( 3064 f"Quick Annotation File {annotation_file} does NOT exist" 3065 ) 3066 3067 self.set_param(param) 3068 3069 if param.get("annotation", None): 3070 log.info("Annotations") 3071 if param.get("annotation", {}).get("parquet", None): 3072 log.info("Annotations 'parquet'...") 3073 self.annotation_parquet() 3074 if param.get("annotation", {}).get("bcftools", None): 3075 log.info("Annotations 'bcftools'...") 3076 self.annotation_bcftools() 3077 if param.get("annotation", {}).get("snpsift", None): 3078 log.info("Annotations 'snpsift'...") 3079 self.annotation_snpsift() 3080 if param.get("annotation", {}).get("annovar", None): 3081 log.info("Annotations 'annovar'...") 3082 self.annotation_annovar() 3083 if param.get("annotation", {}).get("snpeff", None): 3084 log.info("Annotations 'snpeff'...") 3085 self.annotation_snpeff() 3086 if param.get("annotation", {}).get("exomiser", 
None) is not None: 3087 log.info("Annotations 'exomiser'...") 3088 self.annotation_exomiser() 3089 if param.get("annotation", {}).get("splice", None) is not None: 3090 log.info("Annotations 'splice' ...") 3091 self.annotation_splice() 3092 3093 # Explode INFOS fields into table fields 3094 if self.get_explode_infos(): 3095 self.explode_infos( 3096 prefix=self.get_explode_infos_prefix(), 3097 fields=self.get_explode_infos_fields(), 3098 force=True, 3099 ) 3100 3101 def annotation_snpsift(self, threads: int = None) -> None: 3102 """ 3103 This function annotate with bcftools 3104 3105 :param threads: Number of threads to use 3106 :return: the value of the variable "return_value". 3107 """ 3108 3109 # DEBUG 3110 log.debug("Start annotation with bcftools databases") 3111 3112 # Threads 3113 if not threads: 3114 threads = self.get_threads() 3115 log.debug("Threads: " + str(threads)) 3116 3117 # Config 3118 config = self.get_config() 3119 log.debug("Config: " + str(config)) 3120 3121 # Config - snpSift 3122 snpsift_bin_command = get_bin_command( 3123 bin="SnpSift.jar", 3124 tool="snpsift", 3125 bin_type="jar", 3126 config=config, 3127 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3128 ) 3129 if not snpsift_bin_command: 3130 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3131 log.error(msg_err) 3132 raise ValueError(msg_err) 3133 3134 # Config - bcftools 3135 bcftools_bin_command = get_bin_command( 3136 bin="bcftools", 3137 tool="bcftools", 3138 bin_type="bin", 3139 config=config, 3140 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3141 ) 3142 if not bcftools_bin_command: 3143 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3144 log.error(msg_err) 3145 raise ValueError(msg_err) 3146 3147 # Config - BCFTools databases folders 3148 databases_folders = set( 3149 self.get_config() 3150 .get("folders", {}) 3151 .get("databases", {}) 3152 .get("annotations", ["."]) 3153 + self.get_config() 3154 .get("folders", {}) 3155 
.get("databases", {}) 3156 .get("bcftools", ["."]) 3157 ) 3158 log.debug("Databases annotations: " + str(databases_folders)) 3159 3160 # Param 3161 annotations = ( 3162 self.get_param() 3163 .get("annotation", {}) 3164 .get("snpsift", {}) 3165 .get("annotations", None) 3166 ) 3167 log.debug("Annotations: " + str(annotations)) 3168 3169 # Assembly 3170 assembly = self.get_param().get( 3171 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3172 ) 3173 3174 # Data 3175 table_variants = self.get_table_variants() 3176 3177 # Check if not empty 3178 log.debug("Check if not empty") 3179 sql_query_chromosomes = ( 3180 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3181 ) 3182 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3183 if not sql_query_chromosomes_df["count"][0]: 3184 log.info(f"VCF empty") 3185 return 3186 3187 # VCF header 3188 vcf_reader = self.get_header() 3189 log.debug("Initial header: " + str(vcf_reader.infos)) 3190 3191 # Existing annotations 3192 for vcf_annotation in self.get_header().infos: 3193 3194 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3195 log.debug( 3196 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3197 ) 3198 3199 if annotations: 3200 3201 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3202 3203 # Export VCF file 3204 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3205 3206 # Init 3207 commands = {} 3208 3209 for annotation in annotations: 3210 annotation_fields = annotations[annotation] 3211 3212 # Annotation Name 3213 annotation_name = os.path.basename(annotation) 3214 3215 if not annotation_fields: 3216 annotation_fields = {"INFO": None} 3217 3218 log.debug(f"Annotation '{annotation_name}'") 3219 log.debug( 3220 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3221 ) 3222 3223 # Create Database 3224 database = Database( 3225 database=annotation, 3226 databases_folders=databases_folders, 3227 
assembly=assembly, 3228 ) 3229 3230 # Find files 3231 db_file = database.get_database() 3232 db_file = full_path(db_file) 3233 db_hdr_file = database.get_header_file() 3234 db_hdr_file = full_path(db_hdr_file) 3235 db_file_type = database.get_format() 3236 db_tbi_file = f"{db_file}.tbi" 3237 db_file_compressed = database.is_compressed() 3238 3239 # Check if compressed 3240 if not db_file_compressed: 3241 log.error( 3242 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3243 ) 3244 raise ValueError( 3245 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3246 ) 3247 3248 # Check if indexed 3249 if not os.path.exists(db_tbi_file): 3250 log.error( 3251 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3252 ) 3253 raise ValueError( 3254 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3255 ) 3256 3257 # Check index - try to create if not exists 3258 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3259 log.error("Annotation failed: database not valid") 3260 log.error(f"Annotation annotation file: {db_file}") 3261 log.error(f"Annotation annotation header: {db_hdr_file}") 3262 log.error(f"Annotation annotation index: {db_tbi_file}") 3263 raise ValueError( 3264 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3265 ) 3266 else: 3267 3268 log.debug( 3269 f"Annotation '{annotation}' - file: " 3270 + str(db_file) 3271 + " and " 3272 + str(db_hdr_file) 3273 ) 3274 3275 # Load header as VCF object 3276 db_hdr_vcf = Variants(input=db_hdr_file) 3277 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3278 log.debug( 3279 "Annotation database header: " 3280 + str(db_hdr_vcf_header_infos) 3281 ) 3282 3283 # For all fields in database 3284 annotation_fields_full = False 3285 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3286 annotation_fields = { 3287 key: key for key in 
db_hdr_vcf_header_infos 3288 } 3289 log.debug( 3290 "Annotation database header - All annotations added: " 3291 + str(annotation_fields) 3292 ) 3293 annotation_fields_full = True 3294 3295 # # Create file for field rename 3296 # log.debug("Create file for field rename") 3297 # tmp_rename = NamedTemporaryFile( 3298 # prefix=self.get_prefix(), 3299 # dir=self.get_tmp_dir(), 3300 # suffix=".rename", 3301 # delete=False, 3302 # ) 3303 # tmp_rename_name = tmp_rename.name 3304 # tmp_files.append(tmp_rename_name) 3305 3306 # Number of fields 3307 nb_annotation_field = 0 3308 annotation_list = [] 3309 annotation_infos_rename_list = [] 3310 3311 for annotation_field in annotation_fields: 3312 3313 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3314 annotation_fields_new_name = annotation_fields.get( 3315 annotation_field, annotation_field 3316 ) 3317 if not annotation_fields_new_name: 3318 annotation_fields_new_name = annotation_field 3319 3320 # Check if field is in DB and if field is not elready in input data 3321 if ( 3322 annotation_field in db_hdr_vcf.get_header().infos 3323 and annotation_fields_new_name 3324 not in self.get_header().infos 3325 ): 3326 3327 log.info( 3328 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3329 ) 3330 3331 # BCFTools annotate param to rename fields 3332 if annotation_field != annotation_fields_new_name: 3333 annotation_infos_rename_list.append( 3334 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3335 ) 3336 3337 # Add INFO field to header 3338 db_hdr_vcf_header_infos_number = ( 3339 db_hdr_vcf_header_infos[annotation_field].num or "." 
3340 ) 3341 db_hdr_vcf_header_infos_type = ( 3342 db_hdr_vcf_header_infos[annotation_field].type 3343 or "String" 3344 ) 3345 db_hdr_vcf_header_infos_description = ( 3346 db_hdr_vcf_header_infos[annotation_field].desc 3347 or f"{annotation_field} description" 3348 ) 3349 db_hdr_vcf_header_infos_source = ( 3350 db_hdr_vcf_header_infos[annotation_field].source 3351 or "unknown" 3352 ) 3353 db_hdr_vcf_header_infos_version = ( 3354 db_hdr_vcf_header_infos[annotation_field].version 3355 or "unknown" 3356 ) 3357 3358 vcf_reader.infos[annotation_fields_new_name] = ( 3359 vcf.parser._Info( 3360 annotation_fields_new_name, 3361 db_hdr_vcf_header_infos_number, 3362 db_hdr_vcf_header_infos_type, 3363 db_hdr_vcf_header_infos_description, 3364 db_hdr_vcf_header_infos_source, 3365 db_hdr_vcf_header_infos_version, 3366 self.code_type_map[ 3367 db_hdr_vcf_header_infos_type 3368 ], 3369 ) 3370 ) 3371 3372 annotation_list.append(annotation_field) 3373 3374 nb_annotation_field += 1 3375 3376 else: 3377 3378 if ( 3379 annotation_field 3380 not in db_hdr_vcf.get_header().infos 3381 ): 3382 log.warning( 3383 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3384 ) 3385 if ( 3386 annotation_fields_new_name 3387 in self.get_header().infos 3388 ): 3389 log.warning( 3390 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3391 ) 3392 3393 log.info( 3394 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3395 ) 3396 3397 annotation_infos = ",".join(annotation_list) 3398 3399 if annotation_infos != "": 3400 3401 # Annotated VCF (and error file) 3402 tmp_annotation_vcf_name = os.path.join( 3403 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3404 ) 3405 tmp_annotation_vcf_name_err = ( 3406 tmp_annotation_vcf_name + ".err" 3407 ) 3408 3409 # Add fields to annotate 3410 if not annotation_fields_full: 3411 annotation_infos_option = f"-info {annotation_infos}" 3412 else: 
3413 annotation_infos_option = "" 3414 3415 # Info fields rename 3416 if annotation_infos_rename_list: 3417 annotation_infos_rename = " -c " + ",".join( 3418 annotation_infos_rename_list 3419 ) 3420 else: 3421 annotation_infos_rename = "" 3422 3423 # Annotate command 3424 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3425 3426 # Add command 3427 commands[command_annotate] = tmp_annotation_vcf_name 3428 3429 if commands: 3430 3431 # Export VCF file 3432 self.export_variant_vcf( 3433 vcf_file=tmp_vcf_name, 3434 remove_info=True, 3435 add_samples=False, 3436 index=True, 3437 ) 3438 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3439 3440 # Num command 3441 nb_command = 0 3442 3443 # Annotate 3444 for command_annotate in commands: 3445 nb_command += 1 3446 log.info( 3447 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3448 ) 3449 log.debug(f"command_annotate={command_annotate}") 3450 run_parallel_commands([command_annotate], threads) 3451 3452 # Debug 3453 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3454 3455 # Update variants 3456 log.info( 3457 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3458 ) 3459 self.update_from_vcf(commands[command_annotate]) 3460 3461 def annotation_bcftools(self, threads: int = None) -> None: 3462 """ 3463 This function annotate with bcftools 3464 3465 :param threads: Number of threads to use 3466 :return: the value of the variable "return_value". 
3467 """ 3468 3469 # DEBUG 3470 log.debug("Start annotation with bcftools databases") 3471 3472 # Threads 3473 if not threads: 3474 threads = self.get_threads() 3475 log.debug("Threads: " + str(threads)) 3476 3477 # Config 3478 config = self.get_config() 3479 log.debug("Config: " + str(config)) 3480 3481 # DEBUG 3482 delete_tmp = True 3483 if self.get_config().get("verbosity", "warning") in ["debug"]: 3484 delete_tmp = False 3485 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 3486 3487 # Config - BCFTools bin command 3488 bcftools_bin_command = get_bin_command( 3489 bin="bcftools", 3490 tool="bcftools", 3491 bin_type="bin", 3492 config=config, 3493 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3494 ) 3495 if not bcftools_bin_command: 3496 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3497 log.error(msg_err) 3498 raise ValueError(msg_err) 3499 3500 # Config - BCFTools databases folders 3501 databases_folders = set( 3502 self.get_config() 3503 .get("folders", {}) 3504 .get("databases", {}) 3505 .get("annotations", ["."]) 3506 + self.get_config() 3507 .get("folders", {}) 3508 .get("databases", {}) 3509 .get("bcftools", ["."]) 3510 ) 3511 log.debug("Databases annotations: " + str(databases_folders)) 3512 3513 # Param 3514 annotations = ( 3515 self.get_param() 3516 .get("annotation", {}) 3517 .get("bcftools", {}) 3518 .get("annotations", None) 3519 ) 3520 log.debug("Annotations: " + str(annotations)) 3521 3522 # Assembly 3523 assembly = self.get_param().get( 3524 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3525 ) 3526 3527 # Data 3528 table_variants = self.get_table_variants() 3529 3530 # Check if not empty 3531 log.debug("Check if not empty") 3532 sql_query_chromosomes = ( 3533 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3534 ) 3535 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3536 if not sql_query_chromosomes_df["count"][0]: 3537 log.info(f"VCF empty") 
3538 return 3539 3540 # Export in VCF 3541 log.debug("Create initial file to annotate") 3542 tmp_vcf = NamedTemporaryFile( 3543 prefix=self.get_prefix(), 3544 dir=self.get_tmp_dir(), 3545 suffix=".vcf.gz", 3546 delete=False, 3547 ) 3548 tmp_vcf_name = tmp_vcf.name 3549 3550 # VCF header 3551 vcf_reader = self.get_header() 3552 log.debug("Initial header: " + str(vcf_reader.infos)) 3553 3554 # Existing annotations 3555 for vcf_annotation in self.get_header().infos: 3556 3557 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3558 log.debug( 3559 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3560 ) 3561 3562 if annotations: 3563 3564 tmp_ann_vcf_list = [] 3565 commands = [] 3566 tmp_files = [] 3567 err_files = [] 3568 3569 for annotation in annotations: 3570 annotation_fields = annotations[annotation] 3571 3572 # Annotation Name 3573 annotation_name = os.path.basename(annotation) 3574 3575 if not annotation_fields: 3576 annotation_fields = {"INFO": None} 3577 3578 log.debug(f"Annotation '{annotation_name}'") 3579 log.debug( 3580 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3581 ) 3582 3583 # Create Database 3584 database = Database( 3585 database=annotation, 3586 databases_folders=databases_folders, 3587 assembly=assembly, 3588 ) 3589 3590 # Find files 3591 db_file = database.get_database() 3592 db_file = full_path(db_file) 3593 db_hdr_file = database.get_header_file() 3594 db_hdr_file = full_path(db_hdr_file) 3595 db_file_type = database.get_format() 3596 db_tbi_file = f"{db_file}.tbi" 3597 db_file_compressed = database.is_compressed() 3598 3599 # Check if compressed 3600 if not db_file_compressed: 3601 log.error( 3602 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3603 ) 3604 raise ValueError( 3605 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3606 ) 3607 3608 # Check if indexed 3609 if not os.path.exists(db_tbi_file): 3610 log.error(f"Annotation '{annotation}' - {db_file} NOT 
indexed file") 3611 raise ValueError( 3612 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3613 ) 3614 3615 # Check index - try to create if not exists 3616 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3617 log.error("Annotation failed: database not valid") 3618 log.error(f"Annotation annotation file: {db_file}") 3619 log.error(f"Annotation annotation header: {db_hdr_file}") 3620 log.error(f"Annotation annotation index: {db_tbi_file}") 3621 raise ValueError( 3622 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3623 ) 3624 else: 3625 3626 log.debug( 3627 f"Annotation '{annotation}' - file: " 3628 + str(db_file) 3629 + " and " 3630 + str(db_hdr_file) 3631 ) 3632 3633 # Load header as VCF object 3634 db_hdr_vcf = Variants(input=db_hdr_file) 3635 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3636 log.debug( 3637 "Annotation database header: " + str(db_hdr_vcf_header_infos) 3638 ) 3639 3640 # For all fields in database 3641 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3642 annotation_fields = { 3643 key: key for key in db_hdr_vcf_header_infos 3644 } 3645 log.debug( 3646 "Annotation database header - All annotations added: " 3647 + str(annotation_fields) 3648 ) 3649 3650 # Number of fields 3651 nb_annotation_field = 0 3652 annotation_list = [] 3653 3654 for annotation_field in annotation_fields: 3655 3656 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 3657 annotation_fields_new_name = annotation_fields.get( 3658 annotation_field, annotation_field 3659 ) 3660 if not annotation_fields_new_name: 3661 annotation_fields_new_name = annotation_field 3662 3663 # Check if field is in DB and if field is not elready in input data 3664 if ( 3665 annotation_field in db_hdr_vcf.get_header().infos 3666 and annotation_fields_new_name 3667 not in self.get_header().infos 3668 ): 3669 3670 log.info( 3671 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3672 ) 3673 3674 # Add INFO field to header 3675 db_hdr_vcf_header_infos_number = ( 3676 db_hdr_vcf_header_infos[annotation_field].num or "." 3677 ) 3678 db_hdr_vcf_header_infos_type = ( 3679 db_hdr_vcf_header_infos[annotation_field].type 3680 or "String" 3681 ) 3682 db_hdr_vcf_header_infos_description = ( 3683 db_hdr_vcf_header_infos[annotation_field].desc 3684 or f"{annotation_field} description" 3685 ) 3686 db_hdr_vcf_header_infos_source = ( 3687 db_hdr_vcf_header_infos[annotation_field].source 3688 or "unknown" 3689 ) 3690 db_hdr_vcf_header_infos_version = ( 3691 db_hdr_vcf_header_infos[annotation_field].version 3692 or "unknown" 3693 ) 3694 3695 vcf_reader.infos[annotation_fields_new_name] = ( 3696 vcf.parser._Info( 3697 annotation_fields_new_name, 3698 db_hdr_vcf_header_infos_number, 3699 db_hdr_vcf_header_infos_type, 3700 db_hdr_vcf_header_infos_description, 3701 db_hdr_vcf_header_infos_source, 3702 db_hdr_vcf_header_infos_version, 3703 self.code_type_map[db_hdr_vcf_header_infos_type], 3704 ) 3705 ) 3706 3707 # annotation_list.append(annotation_field) 3708 if annotation_field != annotation_fields_new_name: 3709 annotation_list.append( 3710 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3711 ) 3712 else: 3713 annotation_list.append(annotation_field) 3714 3715 nb_annotation_field += 1 3716 3717 else: 3718 3719 if annotation_field not in db_hdr_vcf.get_header().infos: 3720 log.warning( 3721 
f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 3722 ) 3723 if annotation_fields_new_name in self.get_header().infos: 3724 log.warning( 3725 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 3726 ) 3727 3728 log.info( 3729 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3730 ) 3731 3732 annotation_infos = ",".join(annotation_list) 3733 3734 if annotation_infos != "": 3735 3736 # Protect header for bcftools (remove "#CHROM" and variants line) 3737 log.debug("Protect Header file - remove #CHROM line if exists") 3738 tmp_header_vcf = NamedTemporaryFile( 3739 prefix=self.get_prefix(), 3740 dir=self.get_tmp_dir(), 3741 suffix=".hdr", 3742 delete=False, 3743 ) 3744 tmp_header_vcf_name = tmp_header_vcf.name 3745 tmp_files.append(tmp_header_vcf_name) 3746 # Command 3747 if db_hdr_file.endswith(".gz"): 3748 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3749 else: 3750 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3751 # Run 3752 run_parallel_commands([command_extract_header], 1) 3753 3754 # Find chomosomes 3755 log.debug("Find chromosomes ") 3756 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 3757 sql_query_chromosomes_df = self.get_query_to_df( 3758 sql_query_chromosomes 3759 ) 3760 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 3761 3762 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 3763 3764 # BED columns in the annotation file 3765 if db_file_type in ["bed"]: 3766 annotation_infos = "CHROM,POS,POS," + annotation_infos 3767 3768 for chrom in chomosomes_list: 3769 3770 # Create BED on initial VCF 3771 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 3772 tmp_bed = NamedTemporaryFile( 3773 prefix=self.get_prefix(), 3774 
dir=self.get_tmp_dir(), 3775 suffix=".bed", 3776 delete=False, 3777 ) 3778 tmp_bed_name = tmp_bed.name 3779 tmp_files.append(tmp_bed_name) 3780 3781 # Detecte regions 3782 log.debug( 3783 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 3784 ) 3785 window = 1000000 3786 sql_query_intervals_for_bed = f""" 3787 SELECT \"#CHROM\", 3788 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 3789 \"POS\"+{window} 3790 FROM {table_variants} as table_variants 3791 WHERE table_variants.\"#CHROM\" = '{chrom}' 3792 """ 3793 regions = self.conn.execute( 3794 sql_query_intervals_for_bed 3795 ).fetchall() 3796 merged_regions = merge_regions(regions) 3797 log.debug( 3798 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 3799 ) 3800 3801 header = ["#CHROM", "START", "END"] 3802 with open(tmp_bed_name, "w") as f: 3803 # Write the header with tab delimiter 3804 f.write("\t".join(header) + "\n") 3805 for d in merged_regions: 3806 # Write each data row with tab delimiter 3807 f.write("\t".join(map(str, d)) + "\n") 3808 3809 # Tmp files 3810 tmp_annotation_vcf = NamedTemporaryFile( 3811 prefix=self.get_prefix(), 3812 dir=self.get_tmp_dir(), 3813 suffix=".vcf.gz", 3814 delete=False, 3815 ) 3816 tmp_annotation_vcf_name = tmp_annotation_vcf.name 3817 tmp_files.append(tmp_annotation_vcf_name) 3818 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 3819 tmp_annotation_vcf_name_err = ( 3820 tmp_annotation_vcf_name + ".err" 3821 ) 3822 err_files.append(tmp_annotation_vcf_name_err) 3823 3824 # Annotate Command 3825 log.debug( 3826 f"Annotation '{annotation}' - add bcftools command" 3827 ) 3828 3829 # Command 3830 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 
2>>{tmp_annotation_vcf_name_err} " 3831 3832 # Add command 3833 commands.append(command_annotate) 3834 3835 # if some commands 3836 if commands: 3837 3838 # Export VCF file 3839 self.export_variant_vcf( 3840 vcf_file=tmp_vcf_name, 3841 remove_info=True, 3842 add_samples=False, 3843 index=True, 3844 ) 3845 3846 # Threads 3847 # calculate threads for annotated commands 3848 if commands: 3849 threads_bcftools_annotate = round(threads / len(commands)) 3850 else: 3851 threads_bcftools_annotate = 1 3852 3853 if not threads_bcftools_annotate: 3854 threads_bcftools_annotate = 1 3855 3856 # Add threads option to bcftools commands 3857 if threads_bcftools_annotate > 1: 3858 commands_threaded = [] 3859 for command in commands: 3860 commands_threaded.append( 3861 command.replace( 3862 f"{bcftools_bin_command} annotate ", 3863 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 3864 ) 3865 ) 3866 commands = commands_threaded 3867 3868 # Command annotation multithreading 3869 log.debug(f"Annotation - Annotation commands: " + str(commands)) 3870 log.info( 3871 f"Annotation - Annotation multithreaded in " 3872 + str(len(commands)) 3873 + " commands" 3874 ) 3875 3876 run_parallel_commands(commands, threads) 3877 3878 # Merge 3879 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 3880 3881 if tmp_ann_vcf_list_cmd: 3882 3883 # Tmp file 3884 tmp_annotate_vcf = NamedTemporaryFile( 3885 prefix=self.get_prefix(), 3886 dir=self.get_tmp_dir(), 3887 suffix=".vcf.gz", 3888 delete=True, 3889 ) 3890 tmp_annotate_vcf_name = tmp_annotate_vcf.name 3891 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 3892 err_files.append(tmp_annotate_vcf_name_err) 3893 3894 # Tmp file remove command 3895 tmp_files_remove_command = "" 3896 if tmp_files: 3897 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 3898 3899 # Command merge 3900 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o 
    def annotation_exomiser(self, threads: int = None) -> bool:
        """
        Annotate variants with Exomiser.

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
            Default : None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict)
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list)
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "output_options" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl")
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If not exists, database release will be downloaded (take a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample in parameters, first sample in VCF will be chosen
        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off

        :param threads: The number of threads to use
        :return: True on success, False if the VCF is empty or has no samples.
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        # NOTE(review): a missing folder is only logged, not raised — presumably the
        # download step below can create it; confirm this is intended
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser (if not exists)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # tmp_dir = "/tmp/exomiser"

                ### ANALYSIS ###
                ################

                # Create analysis.json through analysis dict
                # either analysis in param or by default
                # depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis json
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict (either yaml or json)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Load analysis dict into analysis dict (either yaml or json)
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict
                # Use preset (exome/genome) to open default config file
                if not param_exomiser_analysis_dict:

                    # default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Try to find if preset is a file
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided in full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    # elif os.path.exists(full_path(param_exomiser_preset)):
                    #     # Preset file is provided in full path
                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided a basename in config folder (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            # param_exomiser_analysis_dict[""] = json.load(json_file)
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict created
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load analysis json
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (either yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            # Load phenopacket dict into analysis dict (either yaml or json)
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject not exists -> find sample ID
                    if not param_exomiser_subject:

                        # Find sample ID in param
                        sample = param_exomiser.get("sample", None)

                        # Find sample ID (first sample)
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If phenotypicFeatures not exists -> Try to infer from hpo list
                    if not param_exomiser_phenotypicfeatures:

                        # Found HPO in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if list in string format separated by comma
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list (keep only digits of each HPO id)
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict
                # NOTE(review): condition checks top-level "metaData" but writes under
                # "phenopacket" — confirm this asymmetry is intended
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # default output formats
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> check
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Replace outputDirectory in output options
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output_results and output format (if exists in param)
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### SPLIT analysis and sample config files

                # Split analysis dict
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITIAL VCF file ###
                #######################

                ### Create list of samples to use and include into initial VCF file ####

                # Subject (main sample)
                # Get sample ID in analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample ID within Pedigree
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample ID in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concat subject sample ID and samples ID in pedigree samples
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Create VCF with sample (either sample in param or first one by default)
                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                # NOTE(review): exomiser_command is never used below — candidate for removal
                exomiser_command = ""

                # Command exomiser options
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # variant white list
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # transcript_source
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )  # ucsc, refseq, ensembl
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contains proband param
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually uniq sample)
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Init result tsv file
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Fields to avoid
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enable
                        if header_column not in fields_to_avoid:

                            # Header info type
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field to add for update to concat fields
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )

                                    ELSE ''
                                END
                                """
                            )

                    # Update query
                    sql_query_update = f"""
                        UPDATE {table_variants} as table_variants
                        SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN table_variants.INFO NOT IN ('','.')
                                THEN ';'
                                ELSE ''
                            END,
                            (
                                SELECT
                                    concat(
                                        {",".join(sql_query_update_concat_fields)}
                                    )
                                FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                                WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                AND table_parquet.\"START\" = table_variants.\"POS\"
                                AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                AND table_parquet.\"REF\" = table_variants.\"REF\"
                            )
                        )
                        ;
                        """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                        exomiser_vcf_header = vcf.Reader(
                            io.StringIO("\n".join(header_list))
                        )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

        return True
gzip.open(output_results_vcf, "rt") as f: 4708 header_list = self.read_vcf_header(f) 4709 exomiser_vcf_header = vcf.Reader( 4710 io.StringIO("\n".join(header_list)) 4711 ) 4712 4713 # Add annotation INFO field to header 4714 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 4715 4716 # Update variants with VCF 4717 self.update_from_vcf(output_results_vcf) 4718 4719 return True 4720 4721 def annotation_snpeff(self, threads: int = None) -> None: 4722 """ 4723 This function annotate with snpEff 4724 4725 :param threads: The number of threads to use 4726 :return: the value of the variable "return_value". 4727 """ 4728 4729 # DEBUG 4730 log.debug("Start annotation with snpeff databases") 4731 4732 # Threads 4733 if not threads: 4734 threads = self.get_threads() 4735 log.debug("Threads: " + str(threads)) 4736 4737 # DEBUG 4738 delete_tmp = True 4739 if self.get_config().get("verbosity", "warning") in ["debug"]: 4740 delete_tmp = False 4741 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4742 4743 # Config 4744 config = self.get_config() 4745 log.debug("Config: " + str(config)) 4746 4747 # Config - Folders - Databases 4748 databases_folders = ( 4749 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 4750 ) 4751 log.debug("Databases annotations: " + str(databases_folders)) 4752 4753 # # Config - Java 4754 # java_bin = get_bin( 4755 # tool="java", 4756 # bin="java", 4757 # bin_type="bin", 4758 # config=config, 4759 # default_folder="/usr/bin", 4760 # ) 4761 # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))): 4762 # log.error(f"Annotation failed: no java bin '{java_bin}'") 4763 # raise ValueError(f"Annotation failed: no java bin '{java_bin}'") 4764 4765 # # Config - snpEff bin 4766 # snpeff_jar = get_bin( 4767 # tool="snpeff", 4768 # bin="snpEff.jar", 4769 # bin_type="jar", 4770 # config=config, 4771 # default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4772 # ) 4773 # if not (os.path.exists(snpeff_jar) or 
(snpeff_jar and which(snpeff_jar))): 4774 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4775 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4776 4777 # Config - snpEff bin command 4778 snpeff_bin_command = get_bin_command( 4779 bin="snpEff.jar", 4780 tool="snpeff", 4781 bin_type="jar", 4782 config=config, 4783 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4784 ) 4785 if not snpeff_bin_command: 4786 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4787 log.error(msg_err) 4788 raise ValueError(msg_err) 4789 4790 # Config - snpEff databases 4791 snpeff_databases = ( 4792 config.get("folders", {}) 4793 .get("databases", {}) 4794 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4795 ) 4796 snpeff_databases = full_path(snpeff_databases) 4797 if snpeff_databases is not None and snpeff_databases != "": 4798 log.debug(f"Create snpEff databases folder") 4799 if not os.path.exists(snpeff_databases): 4800 os.makedirs(snpeff_databases) 4801 4802 # Param 4803 param = self.get_param() 4804 log.debug("Param: " + str(param)) 4805 4806 # Param 4807 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4808 log.debug("Options: " + str(options)) 4809 4810 # Param - Assembly 4811 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4812 4813 # Param - Options 4814 snpeff_options = ( 4815 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4816 ) 4817 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4818 snpeff_csvstats = ( 4819 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4820 ) 4821 if snpeff_stats: 4822 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4823 snpeff_stats = full_path(snpeff_stats) 4824 snpeff_options += f" -stats {snpeff_stats}" 4825 if snpeff_csvstats: 4826 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4827 snpeff_csvstats = full_path(snpeff_csvstats) 4828 
snpeff_options += f" -csvStats {snpeff_csvstats}" 4829 4830 # Data 4831 table_variants = self.get_table_variants() 4832 4833 # Check if not empty 4834 log.debug("Check if not empty") 4835 sql_query_chromosomes = ( 4836 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4837 ) 4838 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4839 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4840 log.info(f"VCF empty") 4841 return 4842 4843 # Export in VCF 4844 log.debug("Create initial file to annotate") 4845 tmp_vcf = NamedTemporaryFile( 4846 prefix=self.get_prefix(), 4847 dir=self.get_tmp_dir(), 4848 suffix=".vcf.gz", 4849 delete=True, 4850 ) 4851 tmp_vcf_name = tmp_vcf.name 4852 4853 # VCF header 4854 vcf_reader = self.get_header() 4855 log.debug("Initial header: " + str(vcf_reader.infos)) 4856 4857 # Existing annotations 4858 for vcf_annotation in self.get_header().infos: 4859 4860 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4861 log.debug( 4862 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4863 ) 4864 4865 # Memory limit 4866 # if config.get("memory", None): 4867 # memory_limit = config.get("memory", "8G") 4868 # else: 4869 # memory_limit = "8G" 4870 memory_limit = self.get_memory("8G") 4871 log.debug(f"memory_limit: {memory_limit}") 4872 4873 # snpEff java options 4874 snpeff_java_options = ( 4875 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4876 ) 4877 log.debug(f"Exomiser java options: {snpeff_java_options}") 4878 4879 force_update_annotation = True 4880 4881 if "ANN" not in self.get_header().infos or force_update_annotation: 4882 4883 # Check snpEff database 4884 log.debug(f"Check snpEff databases {[assembly]}") 4885 databases_download_snpeff( 4886 folder=snpeff_databases, assemblies=[assembly], config=config 4887 ) 4888 4889 # Export VCF file 4890 self.export_variant_vcf( 4891 vcf_file=tmp_vcf_name, 4892 remove_info=True, 
4893 add_samples=False, 4894 index=True, 4895 ) 4896 4897 # Tmp file 4898 err_files = [] 4899 tmp_annotate_vcf = NamedTemporaryFile( 4900 prefix=self.get_prefix(), 4901 dir=self.get_tmp_dir(), 4902 suffix=".vcf", 4903 delete=False, 4904 ) 4905 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4906 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4907 err_files.append(tmp_annotate_vcf_name_err) 4908 4909 # Command 4910 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 4911 log.debug(f"Annotation - snpEff command: {snpeff_command}") 4912 run_parallel_commands([snpeff_command], 1) 4913 4914 # Error messages 4915 log.info(f"Error/Warning messages:") 4916 error_message_command_all = [] 4917 error_message_command_warning = [] 4918 error_message_command_err = [] 4919 for err_file in err_files: 4920 with open(err_file, "r") as f: 4921 for line in f: 4922 message = line.strip() 4923 error_message_command_all.append(message) 4924 if line.startswith("[W::"): 4925 error_message_command_warning.append(message) 4926 if line.startswith("[E::"): 4927 error_message_command_err.append(f"{err_file}: " + message) 4928 # log info 4929 for message in list( 4930 set(error_message_command_err + error_message_command_warning) 4931 ): 4932 log.info(f" {message}") 4933 # debug info 4934 for message in list(set(error_message_command_all)): 4935 log.debug(f" {message}") 4936 # failed 4937 if len(error_message_command_err): 4938 log.error("Annotation failed: Error in commands") 4939 raise ValueError("Annotation failed: Error in commands") 4940 4941 # Find annotation in header 4942 with open(tmp_annotate_vcf_name, "rt") as f: 4943 header_list = self.read_vcf_header(f) 4944 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 4945 4946 for ann in annovar_vcf_header.infos: 4947 if ann not in self.get_header().infos: 4948 vcf_reader.infos[ann] = 
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate loaded variants with Annovar databases.

        Workflow: export the variants table to a temporary bgzipped VCF, then for
        each configured Annovar database run `table_annovar.pl` and pipe the
        result through bcftools/sed/awk to clean and rename INFO fields; merge
        all per-database annotated VCFs with `bcftools merge`; finally register
        the new INFO fields in the in-memory VCF header and update the variants
        table from the merged VCF. Temporary files are removed at the end.

        :param threads: number of threads to use; defaults to the instance
            thread configuration (``self.get_threads()``)
        :raises ValueError: if the annovar or bcftools binary cannot be found,
            or if an annotation command wrote error messages to its err file
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp en Err files
        # tmp_files collects every temporary path for the final cleanup;
        # err_files collects stderr capture files scanned for errors/warnings.
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed and logged but never used below —
        # cleanup always runs (see the `if True:` block at the end). Confirm intent.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl command-line options)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (database name -> {field: new_name} mapping)
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly folder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate on an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header (mutated in place below to register new INFO fields)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug log only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Hard-coded: existing INFO fields are always re-annotated (the
        # "already exists (skipped)" branch below is therefore unreachable).
        force_update_annotation = True

        if annotations:

            # NOTE(review): `commands` appears unused in this method — verify.
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO stripped to "."; no samples; tabix-indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing database files)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run (and output VCF) per database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is reset here each iteration, so the
                # merge-step err file appended later is never scanned — confirm.
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "INFO/old new" line per field)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "f" filter-based by default, "g" gene-based for
                # refGene/ensGene families, "r" region-based for cytoBand
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options ("genebase" is consumed above, not passed through)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan captured stderr; warnings are logged,
                # errors abort the annotation
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge (original exported VCF first, then all annotated VCFs)
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF and register any
                # new INFO fields into the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants table from the merged annotated VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        # NOTE(review): `if True:` looks like a debug leftover — presumably this
        # was meant to be `if delete_tmp:`; confirm before changing behavior.
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate loaded variants with parquet-format annotation databases.

        For each configured database: resolve the database and header files,
        map the requested fields to database columns, register new INFO fields
        in the in-memory VCF header, then build and run one SQL UPDATE per
        chromosome (plus optional field-removal queries when the update option
        is set) that concatenates the annotations into the variants table INFO
        column. Supports "ALL"/"INFO" pseudo-fields (annotate every database
        field), database scanning via the "ALL" annotation key, and
        update/append options from the parameters.

        :param threads: number of threads; NOTE(review): only logged here —
            queries run on self.conn regardless. Confirm whether it should be
            propagated.
        :raises ValueError: if a database file or its header file is not found
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed and logged but unused in this
        # method — confirm intent.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: search folders are the union of "annotations" and "parquet"
        # database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param: database -> {field: new_name} mapping
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation: update replaces existing INFO fields,
        # append only fills fields that are empty ('' or '.')
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate on an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header (mutated in place below to register new INFO fields)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS (total count, used for the final summary log)
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations (debug log only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): added_columns is never populated in this method, so the
        # drop loop at the end is a no-op — confirm whether population was lost.
        added_columns = []

        # drop indexes (they would slow down / conflict with the bulk UPDATEs)
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-database: scan available databases and add each one
            # with full-INFO annotation
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" is a directive handled above, not a real database
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion (ATTACH the database if needed)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos (database columns beyond the VCF core set)
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                # Synthesize a String INFO definition for the
                                # extra column (no real header entry available)
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO (so the fresh value
                                # replaces the existing one)
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                                concat(table_variants.INFO,''),
                                                ';*{annotation_fields_new_name}=[^;]*',
                                                ''
                                            )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO (no leading ';' for the first)
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (fall back to defaults
                            # for missing number/type/description metadata)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append: only annotate when the existing value is empty
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                END
                                """
                                )
                            # Found in a specific column
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
                                        ELSE ''
                                END
                                """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        # Shortcut: copy the database INFO column wholesale
                        # instead of per-field CASE expressions
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # Alias (not a copy): field-removal queries run first,
                        # per-chromosome updates are appended to the same dict
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database: join on POS
                            # falling within [START+1, END], aggregated per POS
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from.\"#CHROM\" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                            )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database: exact match on
                            # CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query: append the new annotations to
                            # INFO, inserting ';' only when INFO is non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                    SET INFO =
                                        concat(
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                THEN table_variants.INFO
                                                ELSE ''
                                            END
                                            ,
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                    AND (
                                                        concat({sql_query_annotation_update_info_sets_sql})
                                                        )
                                                        NOT IN ('','.')
                                                THEN ';'
                                                ELSE ''
                                            END
                                            ,
                                            {sql_query_annotation_update_info_sets_sql}
                                            )
                                    {sql_query_annotation_from_clause}
                                    WHERE {sql_query_annotation_where_clause}
                                    ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x (the generated concat of
                        # CASE expressions can exceed the default depth)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # UPDATE result exposes the affected row count in "Count"
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
5924 """ 5925 5926 # DEBUG 5927 log.debug("Start annotation with splice tools") 5928 5929 # Threads 5930 if not threads: 5931 threads = self.get_threads() 5932 log.debug("Threads: " + str(threads)) 5933 5934 # DEBUG 5935 delete_tmp = True 5936 if self.get_config().get("verbosity", "warning") in ["debug"]: 5937 delete_tmp = False 5938 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5939 5940 # Config 5941 config = self.get_config() 5942 log.debug("Config: " + str(config)) 5943 splice_config = config.get("tools", {}).get("splice", {}) 5944 if not splice_config: 5945 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 5946 if not splice_config: 5947 msg_err = "No Splice tool config" 5948 log.error(msg_err) 5949 raise ValueError(msg_err) 5950 log.debug(f"splice_config={splice_config}") 5951 5952 # Config - Folders - Databases 5953 databases_folders = ( 5954 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 5955 ) 5956 log.debug("Databases annotations: " + str(databases_folders)) 5957 5958 # Splice docker image 5959 splice_docker_image = splice_config.get("docker").get("image") 5960 5961 # Pull splice image if it's not already there 5962 if not check_docker_image_exists(splice_docker_image): 5963 log.warning( 5964 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 5965 ) 5966 try: 5967 command(f"docker pull {splice_config.get('docker').get('image')}") 5968 except subprocess.CalledProcessError: 5969 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 5970 log.error(msg_err) 5971 raise ValueError(msg_err) 5972 return None 5973 5974 # Config - splice databases 5975 splice_databases = ( 5976 config.get("folders", {}) 5977 .get("databases", {}) 5978 .get("splice", DEFAULT_SPLICE_FOLDER) 5979 ) 5980 splice_databases = full_path(splice_databases) 5981 5982 # Param 5983 param = self.get_param() 5984 log.debug("Param: " + str(param)) 5985 5986 # Param 5987 options = 
param.get("annotation", {}).get("splice", {}) 5988 log.debug("Options: " + str(options)) 5989 5990 # Data 5991 table_variants = self.get_table_variants() 5992 5993 # Check if not empty 5994 log.debug("Check if not empty") 5995 sql_query_chromosomes = ( 5996 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5997 ) 5998 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5999 log.info("VCF empty") 6000 return None 6001 6002 # Export in VCF 6003 log.debug("Create initial file to annotate") 6004 6005 # Create output folder 6006 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6007 if not os.path.exists(output_folder): 6008 Path(output_folder).mkdir(parents=True, exist_ok=True) 6009 6010 # Create tmp VCF file 6011 tmp_vcf = NamedTemporaryFile( 6012 prefix=self.get_prefix(), 6013 dir=output_folder, 6014 suffix=".vcf", 6015 delete=False, 6016 ) 6017 tmp_vcf_name = tmp_vcf.name 6018 6019 # VCF header 6020 header = self.get_header() 6021 6022 # Existing annotations 6023 for vcf_annotation in self.get_header().infos: 6024 6025 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6026 log.debug( 6027 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6028 ) 6029 6030 # Memory limit 6031 if config.get("memory", None): 6032 memory_limit = config.get("memory", "8G").upper() 6033 # upper() 6034 else: 6035 memory_limit = "8G" 6036 log.debug(f"memory_limit: {memory_limit}") 6037 6038 # Check number of variants to annotate 6039 where_clause_regex_spliceai = r"SpliceAI_\w+" 6040 where_clause_regex_spip = r"SPiP_\w+" 6041 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6042 df_list_of_variants_to_annotate = self.get_query_to_df( 6043 query=f""" SELECT * FROM variants {where_clause} """ 6044 ) 6045 if len(df_list_of_variants_to_annotate) == 0: 6046 log.warning( 6047 f"No variants to 
annotate with splice. Variants probably already annotated with splice" 6048 ) 6049 return None 6050 else: 6051 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6052 6053 # Export VCF file 6054 self.export_variant_vcf( 6055 vcf_file=tmp_vcf_name, 6056 remove_info=True, 6057 add_samples=True, 6058 index=False, 6059 where_clause=where_clause, 6060 ) 6061 6062 # Create docker container and launch splice analysis 6063 if splice_config: 6064 6065 # Splice mount folders 6066 mount_folders = splice_config.get("mount", {}) 6067 6068 # Genome mount 6069 mount_folders[ 6070 config.get("folders", {}) 6071 .get("databases", {}) 6072 .get("genomes", DEFAULT_GENOME_FOLDER) 6073 ] = "ro" 6074 6075 # SpliceAI mount 6076 mount_folders[ 6077 config.get("folders", {}) 6078 .get("databases", {}) 6079 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6080 ] = "ro" 6081 6082 # Genome mount 6083 mount_folders[ 6084 config.get("folders", {}) 6085 .get("databases", {}) 6086 .get("spip", DEFAULT_SPIP_FOLDER) 6087 ] = "ro" 6088 6089 # Mount folders 6090 mount = [] 6091 6092 # Config mount 6093 mount = [ 6094 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6095 for path, mode in mount_folders.items() 6096 ] 6097 6098 if any(value for value in splice_config.values() if value is None): 6099 log.warning("At least one splice config parameter is empty") 6100 return None 6101 6102 # Params in splice nf 6103 def check_values(dico: dict): 6104 """ 6105 Ensure parameters for NF splice pipeline 6106 """ 6107 for key, val in dico.items(): 6108 if key == "genome": 6109 if any( 6110 assemb in options.get("genome", {}) 6111 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6112 ): 6113 yield f"--{key} hg19" 6114 elif any( 6115 assemb in options.get("genome", {}) 6116 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6117 ): 6118 yield f"--{key} hg38" 6119 elif ( 6120 (isinstance(val, str) and val) 6121 or isinstance(val, int) 6122 or isinstance(val, bool) 6123 ): 6124 yield f"--{key} 
{val}" 6125 6126 # Genome 6127 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6128 options["genome"] = genome 6129 6130 # NF params 6131 nf_params = [] 6132 6133 # Add options 6134 if options: 6135 nf_params = list(check_values(options)) 6136 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6137 else: 6138 log.debug("No NF params provided") 6139 6140 # Add threads 6141 if "threads" not in options.keys(): 6142 nf_params.append(f"--threads {threads}") 6143 6144 # Genome path 6145 genome_path = find_genome( 6146 config.get("folders", {}) 6147 .get("databases", {}) 6148 .get("genomes", DEFAULT_GENOME_FOLDER), 6149 file=f"{genome}.fa", 6150 ) 6151 # Add genome path 6152 if not genome_path: 6153 raise ValueError( 6154 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6155 ) 6156 else: 6157 log.debug(f"Genome: {genome_path}") 6158 nf_params.append(f"--genome_path {genome_path}") 6159 6160 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6161 """ 6162 Setting up updated databases for SPiP and SpliceAI 6163 """ 6164 6165 try: 6166 6167 # SpliceAI assembly transcriptome 6168 spliceai_assembly = os.path.join( 6169 config.get("folders", {}) 6170 .get("databases", {}) 6171 .get("spliceai", {}), 6172 options.get("genome"), 6173 "transcriptome", 6174 ) 6175 spip_assembly = options.get("genome") 6176 6177 spip = find( 6178 f"transcriptome_{spip_assembly}.RData", 6179 config.get("folders", {}).get("databases", {}).get("spip", {}), 6180 ) 6181 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6182 log.debug(f"SPiP annotations: {spip}") 6183 log.debug(f"SpliceAI annotations: {spliceai}") 6184 if spip and spliceai: 6185 return [ 6186 f"--spip_transcriptome {spip}", 6187 f"--spliceai_annotations {spliceai}", 6188 ] 6189 else: 6190 # TODO crash and go on with basic annotations ? 
6191 # raise ValueError( 6192 # "Can't find splice databases in configuration EXIT" 6193 # ) 6194 log.warning( 6195 "Can't find splice databases in configuration, use annotations file from image" 6196 ) 6197 except TypeError: 6198 log.warning( 6199 "Can't find splice databases in configuration, use annotations file from image" 6200 ) 6201 return [] 6202 6203 # Add options, check if transcriptome option have already beend provided 6204 if ( 6205 "spip_transcriptome" not in nf_params 6206 and "spliceai_transcriptome" not in nf_params 6207 ): 6208 splice_reference = splice_annotations(options, config) 6209 if splice_reference: 6210 nf_params.extend(splice_reference) 6211 6212 nf_params.append(f"--output_folder {output_folder}") 6213 6214 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6215 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6216 log.debug(cmd) 6217 6218 splice_config["docker"]["command"] = cmd 6219 6220 docker_cmd = get_bin_command( 6221 tool="splice", 6222 bin_type="docker", 6223 config=config, 6224 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6225 add_options=f"--name {random_uuid} {' '.join(mount)}", 6226 ) 6227 6228 # Docker debug 6229 # if splice_config.get("rm_container"): 6230 # rm_container = "--rm" 6231 # else: 6232 # rm_container = "" 6233 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6234 6235 log.debug(docker_cmd) 6236 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6237 log.debug(res.stdout) 6238 if res.stderr: 6239 log.error(res.stderr) 6240 res.check_returncode() 6241 else: 6242 log.warning(f"Splice tool configuration not found: {config}") 6243 
6244 # Update variants 6245 log.info("Annotation - Updating...") 6246 # Test find output vcf 6247 log.debug( 6248 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6249 ) 6250 output_vcf = [] 6251 # Wrong folder to look in 6252 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6253 if ( 6254 files 6255 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6256 ): 6257 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6258 # log.debug(os.listdir(options.get("output_folder"))) 6259 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6260 if not output_vcf: 6261 log.debug( 6262 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6263 ) 6264 else: 6265 # Get new header from annotated vcf 6266 log.debug(f"Initial header: {len(header.infos)} fields") 6267 # Create new header with splice infos 6268 new_vcf = Variants(input=output_vcf[0]) 6269 new_vcf_header = new_vcf.get_header().infos 6270 for keys, infos in new_vcf_header.items(): 6271 if keys not in header.infos.keys(): 6272 header.infos[keys] = infos 6273 log.debug(f"New header: {len(header.infos)} fields") 6274 log.debug(f"Splice tmp output: {output_vcf[0]}") 6275 self.update_from_vcf(output_vcf[0]) 6276 6277 # Remove folder 6278 remove_if_exists(output_folder) 6279 6280 ### 6281 # Prioritization 6282 ### 6283 6284 def get_config_default(self, name: str) -> dict: 6285 """ 6286 The function `get_config_default` returns a dictionary containing default configurations for 6287 various calculations and prioritizations. 6288 6289 :param name: The `get_config_default` function returns a dictionary containing default 6290 configurations for different calculations and prioritizations. 
The `name` parameter is used to 6291 specify which specific configuration to retrieve from the dictionary 6292 :type name: str 6293 :return: The function `get_config_default` returns a dictionary containing default configuration 6294 settings for different calculations and prioritizations. The specific configuration settings are 6295 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6296 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6297 returned. If there is no match, an empty dictionary is returned. 6298 """ 6299 6300 config_default = { 6301 "calculations": { 6302 "variant_chr_pos_alt_ref": { 6303 "type": "sql", 6304 "name": "variant_chr_pos_alt_ref", 6305 "description": "Create a variant ID with chromosome, position, alt and ref", 6306 "available": False, 6307 "output_column_name": "variant_chr_pos_alt_ref", 6308 "output_column_type": "String", 6309 "output_column_description": "variant ID with chromosome, position, alt and ref", 6310 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6311 "operation_info": True, 6312 }, 6313 "VARTYPE": { 6314 "type": "sql", 6315 "name": "VARTYPE", 6316 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6317 "available": True, 6318 "output_column_name": "VARTYPE", 6319 "output_column_type": "String", 6320 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6321 "operation_query": """ 6322 CASE 6323 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6324 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6325 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6326 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6327 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6328 ELSE 'UNDEFINED' 6329 END 6330 """, 6331 "info_fields": ["SVTYPE"], 6332 "operation_info": True, 6333 }, 6334 "snpeff_hgvs": { 6335 "type": "python", 6336 "name": "snpeff_hgvs", 6337 "description": "HGVS nomenclatures from snpEff annotation", 6338 "available": True, 6339 "function_name": "calculation_extract_snpeff_hgvs", 6340 "function_params": ["snpeff_hgvs", "ANN"], 6341 }, 6342 "snpeff_ann_explode": { 6343 "type": "python", 6344 "name": "snpeff_ann_explode", 6345 "description": "Explode snpEff annotations with uniquify values", 6346 "available": True, 6347 "function_name": "calculation_snpeff_ann_explode", 6348 "function_params": [False, "fields", "snpeff_", "ANN"], 6349 }, 6350 "snpeff_ann_explode_uniquify": { 6351 "type": "python", 6352 "name": "snpeff_ann_explode_uniquify", 6353 "description": "Explode snpEff annotations", 6354 "available": True, 6355 "function_name": "calculation_snpeff_ann_explode", 6356 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6357 }, 6358 "snpeff_ann_explode_json": { 6359 "type": "python", 6360 "name": "snpeff_ann_explode_json", 6361 "description": "Explode snpEff annotations in JSON format", 6362 "available": True, 6363 "function_name": "calculation_snpeff_ann_explode", 6364 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6365 }, 6366 "NOMEN": { 6367 "type": "python", 6368 "name": "NOMEN", 6369 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6370 "available": True, 6371 "function_name": "calculation_extract_nomen", 6372 "function_params": [], 6373 }, 6374 "FINDBYPIPELINE": { 6375 "type": "python", 6376 "name": "FINDBYPIPELINE", 6377 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6378 "available": True, 6379 "function_name": "calculation_find_by_pipeline", 6380 "function_params": ["findbypipeline"], 6381 }, 6382 "FINDBYSAMPLE": { 6383 "type": "python", 6384 "name": "FINDBYSAMPLE", 6385 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6386 "available": True, 6387 "function_name": "calculation_find_by_pipeline", 6388 "function_params": ["findbysample"], 6389 }, 6390 "GENOTYPECONCORDANCE": { 6391 "type": "python", 6392 "name": "GENOTYPECONCORDANCE", 6393 "description": "Concordance of genotype for multi caller VCF", 6394 "available": True, 6395 "function_name": "calculation_genotype_concordance", 6396 "function_params": [], 6397 }, 6398 "BARCODE": { 6399 "type": "python", 6400 "name": "BARCODE", 6401 "description": "BARCODE as VaRank tool", 6402 "available": True, 6403 "function_name": "calculation_barcode", 6404 "function_params": [], 6405 }, 6406 "BARCODEFAMILY": { 6407 "type": "python", 6408 "name": "BARCODEFAMILY", 6409 "description": "BARCODEFAMILY as VaRank tool", 6410 "available": True, 6411 "function_name": "calculation_barcode_family", 6412 "function_params": ["BCF"], 6413 }, 6414 "TRIO": { 6415 "type": "python", 6416 "name": "TRIO", 6417 "description": "Inheritance for a trio family", 6418 "available": True, 6419 "function_name": "calculation_trio", 6420 "function_params": [], 6421 }, 6422 "VAF": { 6423 "type": "python", 6424 "name": "VAF", 6425 "description": "Variant Allele Frequency (VAF) harmonization", 6426 "available": True, 6427 "function_name": "calculation_vaf_normalization", 6428 "function_params": [], 6429 }, 6430 "VAF_stats": { 6431 "type": "python", 6432 "name": 
"VAF_stats", 6433 "description": "Variant Allele Frequency (VAF) statistics", 6434 "available": True, 6435 "function_name": "calculation_genotype_stats", 6436 "function_params": ["VAF"], 6437 }, 6438 "DP_stats": { 6439 "type": "python", 6440 "name": "DP_stats", 6441 "description": "Depth (DP) statistics", 6442 "available": True, 6443 "function_name": "calculation_genotype_stats", 6444 "function_params": ["DP"], 6445 }, 6446 "variant_id": { 6447 "type": "python", 6448 "name": "variant_id", 6449 "description": "Variant ID generated from variant position and type", 6450 "available": True, 6451 "function_name": "calculation_variant_id", 6452 "function_params": [], 6453 }, 6454 "transcripts_json": { 6455 "type": "python", 6456 "name": "transcripts_json", 6457 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6458 "available": True, 6459 "function_name": "calculation_transcripts_annotation", 6460 "function_params": ["transcripts_json", None], 6461 }, 6462 "transcripts_ann": { 6463 "type": "python", 6464 "name": "transcripts_ann", 6465 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6466 "available": True, 6467 "function_name": "calculation_transcripts_annotation", 6468 "function_params": [None, "transcripts_ann"], 6469 }, 6470 "transcripts_annotations": { 6471 "type": "python", 6472 "name": "transcripts_annotations", 6473 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6474 "available": True, 6475 "function_name": "calculation_transcripts_annotation", 6476 "function_params": [None, None], 6477 }, 6478 "transcripts_prioritization": { 6479 "type": "python", 6480 "name": "transcripts_prioritization", 6481 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6482 "available": True, 6483 "function_name": "calculation_transcripts_prioritization", 6484 "function_params": [], 6485 }, 6486 }, 6487 
"prioritizations": { 6488 "default": { 6489 "filter": [ 6490 { 6491 "type": "notequals", 6492 "value": "!PASS|\\.", 6493 "score": 0, 6494 "flag": "FILTERED", 6495 "comment": ["Bad variant quality"], 6496 }, 6497 { 6498 "type": "equals", 6499 "value": "REJECT", 6500 "score": -20, 6501 "flag": "PASS", 6502 "comment": ["Bad variant quality"], 6503 }, 6504 ], 6505 "DP": [ 6506 { 6507 "type": "gte", 6508 "value": "50", 6509 "score": 5, 6510 "flag": "PASS", 6511 "comment": ["DP higher than 50"], 6512 } 6513 ], 6514 "ANN": [ 6515 { 6516 "type": "contains", 6517 "value": "HIGH", 6518 "score": 5, 6519 "flag": "PASS", 6520 "comment": [ 6521 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6522 ], 6523 }, 6524 { 6525 "type": "contains", 6526 "value": "MODERATE", 6527 "score": 3, 6528 "flag": "PASS", 6529 "comment": [ 6530 "A non-disruptive variant that might change protein effectiveness" 6531 ], 6532 }, 6533 { 6534 "type": "contains", 6535 "value": "LOW", 6536 "score": 0, 6537 "flag": "FILTERED", 6538 "comment": [ 6539 "Assumed to be mostly harmless or unlikely to change protein behavior" 6540 ], 6541 }, 6542 { 6543 "type": "contains", 6544 "value": "MODIFIER", 6545 "score": 0, 6546 "flag": "FILTERED", 6547 "comment": [ 6548 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6549 ], 6550 }, 6551 ], 6552 } 6553 }, 6554 } 6555 6556 return config_default.get(name, None) 6557 6558 def get_config_json( 6559 self, name: str, config_dict: dict = {}, config_file: str = None 6560 ) -> dict: 6561 """ 6562 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6563 default values, a dictionary, and a file. 6564 6565 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6566 the name of the configuration. 
It is used to identify and retrieve the configuration settings 6567 for a specific component or module 6568 :type name: str 6569 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6570 dictionary that allows you to provide additional configuration settings or overrides. When you 6571 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6572 the key is the configuration setting you want to override or 6573 :type config_dict: dict 6574 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6575 specify the path to a configuration file that contains additional settings. If provided, the 6576 function will read the contents of this file and update the configuration dictionary with the 6577 values found in the file, overriding any existing values with the 6578 :type config_file: str 6579 :return: The function `get_config_json` returns a dictionary containing the configuration 6580 settings. 
6581 """ 6582 6583 # Create with default prioritizations 6584 config_default = self.get_config_default(name=name) 6585 configuration = config_default 6586 # log.debug(f"configuration={configuration}") 6587 6588 # Replace prioritizations from dict 6589 for config in config_dict: 6590 configuration[config] = config_dict[config] 6591 6592 # Replace prioritizations from file 6593 config_file = full_path(config_file) 6594 if config_file: 6595 if os.path.exists(config_file): 6596 with open(config_file) as config_file_content: 6597 config_file_dict = json.load(config_file_content) 6598 for config in config_file_dict: 6599 configuration[config] = config_file_dict[config] 6600 else: 6601 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6602 log.error(msg_error) 6603 raise ValueError(msg_error) 6604 6605 return configuration 6606 6607 def prioritization( 6608 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 6609 ) -> bool: 6610 """ 6611 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 6612 prioritizes variants based on configured profiles and criteria. 6613 6614 :param table: The `table` parameter in the `prioritization` function is used to specify the name 6615 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 6616 a table name is provided, the method will prioritize the variants in that specific table 6617 :type table: str 6618 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 6619 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 6620 provided, the code will use a default prefix value of "PZ" 6621 :type pz_prefix: str 6622 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 6623 additional parameters specific to the prioritization process. 
    def prioritization(
        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
    ) -> bool:
        """
        Prioritize variants by applying one or several prioritization profiles.

        For each configured profile, every annotation listed in the profile is
        compared against its criteria; matching variants accumulate a score
        (PZScore), a pass/filter flag (PZFlag), comments (PZComment) and raw
        criterion details (PZInfos). Results are written back into the INFO
        column of the variants table, both as per-profile fields
        (e.g. "PZScore_default") and, for the default profile, as unsuffixed
        fields (e.g. "PZScore").

        :param table: Name of the variants table to prioritize; defaults to
            the table returned by `get_table_variants(clause="update")`
        :type table: str
        :param pz_prefix: Prefix of the generated INFO fields; defaults to the
            "pzprefix" prioritization parameter, or "PZ"
        :type pz_prefix: str
        :param pz_param: Optional prioritization parameters overriding the
            "prioritization" section of `get_param()`
        :type pz_param: dict
        :return: True once prioritization has been applied, False if no
            profile is defined
        :raises ValueError: If a requested profile is not configured
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization param: explicit argument wins over the stored params
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Configuration profiles (defaults merged with optional JSON file)
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options (comma-separated strings are accepted)
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations: extra profiles requested at the top level
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                log.info(f"   {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Every requested profile must exist in the configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            log.debug(f"No profile defined")
            return False

        # First profile acts as the default one when none is specified
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Added columns (working columns dropped again at the end)
        added_columns = []

        # Create list of PZfields
        # List of PZFields: plain fields plus one per (field, profile) pair
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Remove existing PZfields to use if exists
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos prefix
            explode_infos_prefix = self.get_explode_infos_prefix()

            # PZfields tags description (VCF INFO header metadata)
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
            }

            # Create INFO fields if not exist (unsuffixed, for default profile)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO fields if not exist for each profile
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Header: add one working column per PZfield, typed by field kind
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        sql_set_info = []

                        # PZ fields set: build the INFO key=value snippets

                        # PZScore
                        if (
                            f"{pz_prefix}Score{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score{pzfields_sep}{profile}=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Score" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )

                        # PZFlag
                        if (
                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Flag" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )

                        # PZComment
                        if (
                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Comment" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # PZInfos
                        if (
                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Infos" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # Merge PZfields into one ';'-joined concat() argument list
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # Explode specific annotation
                            log.debug(f"Explode annotation '{annotation}'")
                            added_columns += self.explode_infos(
                                prefix=explode_infos_prefix,
                                fields=[annotation],
                                table=table_variants,
                            )
                            extra_infos = self.get_extra_infos(table=table_variants)

                            # Check if annotation field is present
                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
                                log.debug(f"Annotation '{annotation}' not in data")
                                continue
                            else:
                                log.debug(f"Annotation '{annotation}' in data")

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:
                                criterion_type = criterion["type"]
                                criterion_value = criterion["value"]
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_flag_bool = criterion_flag == "PASS"
                                # Escape quotes/separators so values embed
                                # safely in the SQL and in the INFO column
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                sql_set = []
                                sql_set_info = []

                                # PZ fields set
                                if (
                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # HOWARD mode accumulates scores; VaRank
                                    # mode keeps the maximum criterion score
                                    if prioritization_score_mode == "HOWARD":
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                    elif prioritization_score_mode == "VaRank":
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
                                        )
                                    else:
                                        # Unknown mode falls back to HOWARD-style accumulation
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                if (
                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # Flag stays PASS only while every matched criterion flags PASS
                                    sql_set.append(
                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )
                                if (
                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Comment{pzfields_sep}{profile} =
                                            concat(
                                                {pz_prefix}Comment{pzfields_sep}{profile},
                                                CASE
                                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
                                                    THEN ', '
                                                    ELSE ''
                                                END,
                                                '{criterion_comment}'
                                            )
                                        """
                                    )
                                if (
                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Infos{pzfields_sep}{profile} =
                                            concat(
                                                {pz_prefix}Infos{pzfields_sep}{profile},
                                                '{criterion_infos}'
                                            )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Criterion and comparison: numeric values use
                                # the mapped comparison operator, other values
                                # use pattern matching (SIMILAR TO)
                                if sql_set_option:
                                    # NOTE(review): bare except — intended to
                                    # catch the float() conversion failure
                                    # only; a narrower `except ValueError`
                                    # would be safer. Left as-is.
                                    try:
                                        float(criterion_value)
                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                            AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                        """
                                    except:
                                        contains_option = ""
                                        if criterion_type == "contains":
                                            contains_option = ".*"
                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                        """
                                    sql_queries.append(sql_update)
                                else:
                                    log.warning(
                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
                                    )

                        # PZTags
                        if (
                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):

                            # Create PZTags value ('field#value' pairs joined by '|')
                            pztags_value = ""
                            pztags_sep_default = "|"
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in [f"{pz_prefix}Tags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in [f"{pz_prefix}Flag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                                    THEN 'PASS'
                                                    ELSE 'FILTERED'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Add Query update for PZTags
                            sql_update_pztags = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    CASE WHEN INFO NOT in ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                                )
                            """
                            sql_queries.append(sql_update_pztags)

                            # Add Query update for PZTags for default profile
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                    UPDATE {table_variants}
                                    SET INFO = concat(
                                        INFO,
                                        ';',
                                        '{pz_prefix}Tags={pztags_value}'
                                    )
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        if sql_queries:

                            for sql_query in sql_queries:
                                log.debug(
                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                                )
                                self.conn.execute(sql_query)

                            # NOTE(review): the INFO rewrite below runs only
                            # when at least one criterion query was built —
                            # confirm this matches the intended behavior.
                            log.info(f"""Profile '{profile}' - Update... """)
                            sql_query_update = f"""
                                UPDATE {table_variants}
                                SET INFO =
                                    concat(
                                        CASE
                                            WHEN INFO NOT IN ('','.')
                                            THEN concat(INFO, ';')
                                            ELSE ''
                                        END
                                        {sql_set_info_option}
                                    )
                            """
                            self.conn.execute(sql_query_update)

            else:

                log.warning(f"No profiles in parameters")

        # Remove added columns (working columns created for this run)
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return True

    ###
    # HGVS
    ###
7232 """ 7233 return partition.apply(annotation_hgvs_partition, axis=1) 7234 7235 def annotation_hgvs_partition(row) -> str: 7236 """ 7237 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7238 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7239 7240 :param row: A dictionary-like object that contains the values for the following keys: 7241 :return: a string that contains the HGVS names associated with the given row of data. 7242 """ 7243 7244 chr = row["CHROM"] 7245 pos = row["POS"] 7246 ref = row["REF"] 7247 alt = row["ALT"] 7248 7249 # Find list of associated transcripts 7250 transcripts_list = list( 7251 polars_conn.execute( 7252 f""" 7253 SELECT transcript 7254 FROM refseq_df 7255 WHERE CHROM='{chr}' 7256 AND POS={pos} 7257 """ 7258 )["transcript"] 7259 ) 7260 7261 # Full HGVS annotation in list 7262 hgvs_full_list = [] 7263 7264 for transcript_name in transcripts_list: 7265 7266 # Transcript 7267 transcript = get_transcript( 7268 transcripts=transcripts, transcript_name=transcript_name 7269 ) 7270 # Exon 7271 if use_exon: 7272 exon = transcript.find_exon_number(pos) 7273 else: 7274 exon = None 7275 # Protein 7276 transcript_protein = None 7277 if use_protein or add_protein or full_format: 7278 transcripts_protein = list( 7279 polars_conn.execute( 7280 f""" 7281 SELECT protein 7282 FROM refseqlink_df 7283 WHERE transcript='{transcript_name}' 7284 LIMIT 1 7285 """ 7286 )["protein"] 7287 ) 7288 if len(transcripts_protein): 7289 transcript_protein = transcripts_protein[0] 7290 7291 # HGVS name 7292 hgvs_name = format_hgvs_name( 7293 chr, 7294 pos, 7295 ref, 7296 alt, 7297 genome=genome, 7298 transcript=transcript, 7299 transcript_protein=transcript_protein, 7300 exon=exon, 7301 use_gene=use_gene, 7302 use_protein=use_protein, 7303 full_format=full_format, 7304 use_version=use_version, 7305 codon_type=codon_type, 7306 ) 7307 hgvs_full_list.append(hgvs_name) 7308 if add_protein and not 
use_protein and not full_format: 7309 hgvs_name = format_hgvs_name( 7310 chr, 7311 pos, 7312 ref, 7313 alt, 7314 genome=genome, 7315 transcript=transcript, 7316 transcript_protein=transcript_protein, 7317 exon=exon, 7318 use_gene=use_gene, 7319 use_protein=True, 7320 full_format=False, 7321 use_version=use_version, 7322 codon_type=codon_type, 7323 ) 7324 hgvs_full_list.append(hgvs_name) 7325 7326 # Create liste of HGVS annotations 7327 hgvs_full = ",".join(hgvs_full_list) 7328 7329 return hgvs_full 7330 7331 # Polars connexion 7332 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7333 7334 # Config 7335 config = self.get_config() 7336 7337 # Databases 7338 # Genome 7339 databases_genomes_folders = ( 7340 config.get("folders", {}) 7341 .get("databases", {}) 7342 .get("genomes", DEFAULT_GENOME_FOLDER) 7343 ) 7344 databases_genome = ( 7345 config.get("folders", {}).get("databases", {}).get("genomes", "") 7346 ) 7347 # refseq database folder 7348 databases_refseq_folders = ( 7349 config.get("folders", {}) 7350 .get("databases", {}) 7351 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7352 ) 7353 # refseq 7354 databases_refseq = config.get("databases", {}).get("refSeq", None) 7355 # refSeqLink 7356 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7357 7358 # Param 7359 param = self.get_param() 7360 7361 # Quick HGVS 7362 if "hgvs_options" in param and param.get("hgvs_options", ""): 7363 log.info(f"Quick HGVS Annotation:") 7364 if not param.get("hgvs", None): 7365 param["hgvs"] = {} 7366 for option in param.get("hgvs_options", "").split(","): 7367 option_var_val = option.split("=") 7368 option_var = option_var_val[0] 7369 if len(option_var_val) > 1: 7370 option_val = option_var_val[1] 7371 else: 7372 option_val = "True" 7373 if option_val.upper() in ["TRUE"]: 7374 option_val = True 7375 elif option_val.upper() in ["FALSE"]: 7376 option_val = False 7377 log.info(f" {option_var}={option_val}") 7378 param["hgvs"][option_var] = option_val 7379 
7380 # Check if HGVS annotation enabled 7381 if "hgvs" in param: 7382 log.info(f"HGVS Annotation... ") 7383 for hgvs_option in param.get("hgvs", {}): 7384 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7385 else: 7386 return 7387 7388 # HGVS Param 7389 param_hgvs = param.get("hgvs", {}) 7390 use_exon = param_hgvs.get("use_exon", False) 7391 use_gene = param_hgvs.get("use_gene", False) 7392 use_protein = param_hgvs.get("use_protein", False) 7393 add_protein = param_hgvs.get("add_protein", False) 7394 full_format = param_hgvs.get("full_format", False) 7395 use_version = param_hgvs.get("use_version", False) 7396 codon_type = param_hgvs.get("codon_type", "3") 7397 7398 # refSseq refSeqLink 7399 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7400 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7401 7402 # Assembly 7403 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7404 7405 # Genome 7406 genome_file = None 7407 if find_genome(databases_genome): 7408 genome_file = find_genome(databases_genome) 7409 else: 7410 genome_file = find_genome( 7411 genome_path=databases_genomes_folders, assembly=assembly 7412 ) 7413 log.debug("Genome: " + str(genome_file)) 7414 7415 # refSseq 7416 refseq_file = find_file_prefix( 7417 input_file=databases_refseq, 7418 prefix="ncbiRefSeq", 7419 folder=databases_refseq_folders, 7420 assembly=assembly, 7421 ) 7422 log.debug("refSeq: " + str(refseq_file)) 7423 7424 # refSeqLink 7425 refseqlink_file = find_file_prefix( 7426 input_file=databases_refseqlink, 7427 prefix="ncbiRefSeqLink", 7428 folder=databases_refseq_folders, 7429 assembly=assembly, 7430 ) 7431 log.debug("refSeqLink: " + str(refseqlink_file)) 7432 7433 # Threads 7434 if not threads: 7435 threads = self.get_threads() 7436 log.debug("Threads: " + str(threads)) 7437 7438 # Variables 7439 table_variants = self.get_table_variants(clause="update") 7440 7441 # Get variants SNV and InDel only 7442 
query_variants = f""" 7443 SELECT "#CHROM" AS CHROM, POS, REF, ALT 7444 FROM {table_variants} 7445 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 7446 """ 7447 df_variants = self.get_query_to_df(query_variants) 7448 7449 # Added columns 7450 added_columns = [] 7451 7452 # Add hgvs column in variants table 7453 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 7454 added_column = self.add_column( 7455 table_variants, hgvs_column_name, "STRING", default_value=None 7456 ) 7457 added_columns.append(added_column) 7458 7459 log.debug(f"refSeq loading...") 7460 # refSeq in duckDB 7461 refseq_table = get_refseq_table( 7462 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 7463 ) 7464 # Loading all refSeq in Dataframe 7465 refseq_query = f""" 7466 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 7467 FROM {refseq_table} 7468 JOIN df_variants ON ( 7469 {refseq_table}.chrom = df_variants.CHROM 7470 AND {refseq_table}.txStart<=df_variants.POS 7471 AND {refseq_table}.txEnd>=df_variants.POS 7472 ) 7473 """ 7474 refseq_df = self.conn.query(refseq_query).pl() 7475 7476 if refseqlink_file: 7477 log.debug(f"refSeqLink loading...") 7478 # refSeqLink in duckDB 7479 refseqlink_table = get_refseq_table( 7480 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 7481 ) 7482 # Loading all refSeqLink in Dataframe 7483 protacc_column = "protAcc_with_ver" 7484 mrnaacc_column = "mrnaAcc_with_ver" 7485 refseqlink_query = f""" 7486 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 7487 FROM {refseqlink_table} 7488 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 7489 WHERE protAcc_without_ver IS NOT NULL 7490 """ 7491 # Polars Dataframe 7492 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 7493 7494 # Read RefSeq transcripts into a python dict/model. 
7495 log.debug(f"Transcripts loading...") 7496 with tempfile.TemporaryDirectory() as tmpdir: 7497 transcripts_query = f""" 7498 COPY ( 7499 SELECT {refseq_table}.* 7500 FROM {refseq_table} 7501 JOIN df_variants ON ( 7502 {refseq_table}.chrom=df_variants.CHROM 7503 AND {refseq_table}.txStart<=df_variants.POS 7504 AND {refseq_table}.txEnd>=df_variants.POS 7505 ) 7506 ) 7507 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 7508 """ 7509 self.conn.query(transcripts_query) 7510 with open(f"{tmpdir}/transcript.tsv") as infile: 7511 transcripts = read_transcripts(infile) 7512 7513 # Polars connexion 7514 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7515 7516 log.debug("Genome loading...") 7517 # Read genome sequence using pyfaidx. 7518 genome = Fasta(genome_file) 7519 7520 log.debug("Start annotation HGVS...") 7521 7522 # Create 7523 # a Dask Dataframe from Pandas dataframe with partition as number of threads 7524 ddf = dd.from_pandas(df_variants, npartitions=threads) 7525 7526 # Use dask.dataframe.apply() to apply function on each partition 7527 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 7528 7529 # Convert Dask DataFrame to Pandas Dataframe 7530 df = ddf.compute() 7531 7532 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
7533 with tempfile.TemporaryDirectory() as tmpdir: 7534 df_parquet = os.path.join(tmpdir, "df.parquet") 7535 df.to_parquet(df_parquet) 7536 7537 # Update hgvs column 7538 update_variant_query = f""" 7539 UPDATE {table_variants} 7540 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 7541 FROM read_parquet('{df_parquet}') as df 7542 WHERE variants."#CHROM" = df.CHROM 7543 AND variants.POS = df.POS 7544 AND variants.REF = df.REF 7545 AND variants.ALT = df.ALT 7546 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 7547 """ 7548 self.execute_query(update_variant_query) 7549 7550 # Update INFO column 7551 sql_query_update = f""" 7552 UPDATE {table_variants} 7553 SET INFO = 7554 concat( 7555 CASE 7556 WHEN INFO NOT IN ('','.') 7557 THEN concat(INFO, ';') 7558 ELSE '' 7559 END, 7560 'hgvs=', 7561 {hgvs_column_name} 7562 ) 7563 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 7564 """ 7565 self.execute_query(sql_query_update) 7566 7567 # Add header 7568 HGVS_INFOS = { 7569 "hgvs": { 7570 "ID": "hgvs", 7571 "Number": ".", 7572 "Type": "String", 7573 "Description": f"HGVS annotatation with HOWARD", 7574 } 7575 } 7576 7577 for field in HGVS_INFOS: 7578 field_ID = HGVS_INFOS[field]["ID"] 7579 field_description = HGVS_INFOS[field]["Description"] 7580 self.get_header().infos[field_ID] = vcf.parser._Info( 7581 field_ID, 7582 HGVS_INFOS[field]["Number"], 7583 HGVS_INFOS[field]["Type"], 7584 field_description, 7585 "unknown", 7586 "unknown", 7587 code_type_map[HGVS_INFOS[field]["Type"]], 7588 ) 7589 7590 # Remove added columns 7591 for added_column in added_columns: 7592 self.drop_column(column=added_column) 7593 7594 ### 7595 # Calculation 7596 ### 7597 7598 def get_operations_help( 7599 self, operations_config_dict: dict = {}, operations_config_file: str = None 7600 ) -> list: 7601 7602 # Init 7603 operations_help = [] 7604 7605 # operations 7606 operations = self.get_config_json( 7607 name="calculations", 7608 
config_dict=operations_config_dict, 7609 config_file=operations_config_file, 7610 ) 7611 for op in operations: 7612 op_name = operations[op].get("name", op).upper() 7613 op_description = operations[op].get("description", op_name) 7614 op_available = operations[op].get("available", False) 7615 if op_available: 7616 operations_help.append(f" {op_name}: {op_description}") 7617 7618 # Sort operations 7619 operations_help.sort() 7620 7621 # insert header 7622 operations_help.insert(0, "Available calculation operations:") 7623 7624 # Return 7625 return operations_help 7626 7627 def calculation( 7628 self, 7629 operations: dict = {}, 7630 operations_config_dict: dict = {}, 7631 operations_config_file: str = None, 7632 ) -> None: 7633 """ 7634 It takes a list of operations, and for each operation, it checks if it's a python or sql 7635 operation, and then calls the appropriate function 7636 7637 param json example: 7638 "calculation": { 7639 "NOMEN": { 7640 "options": { 7641 "hgvs_field": "hgvs" 7642 }, 7643 "middle" : null 7644 } 7645 """ 7646 7647 # Param 7648 param = self.get_param() 7649 7650 # operations config 7651 operations_config = self.get_config_json( 7652 name="calculations", 7653 config_dict=operations_config_dict, 7654 config_file=operations_config_file, 7655 ) 7656 7657 # Upper keys 7658 operations_config = {k.upper(): v for k, v in operations_config.items()} 7659 7660 # Calculations 7661 7662 # Operations from param 7663 operations = param.get("calculation", {}).get("calculations", operations) 7664 7665 # Quick calculation - add 7666 if param.get("calculations", None): 7667 calculations_list = [ 7668 value for value in param.get("calculations", "").split(",") 7669 ] 7670 log.info(f"Quick Calculations:") 7671 for calculation_key in calculations_list: 7672 log.info(f" {calculation_key}") 7673 for calculation_operation in calculations_list: 7674 if calculation_operation.upper() not in operations: 7675 operations[calculation_operation.upper()] = {} 7676 
add_value_into_dict( 7677 dict_tree=param, 7678 sections=[ 7679 "calculation", 7680 "calculations", 7681 calculation_operation.upper(), 7682 ], 7683 value={}, 7684 ) 7685 7686 # Operations for calculation 7687 if not operations: 7688 operations = param.get("calculation", {}).get("calculations", {}) 7689 7690 if operations: 7691 log.info(f"Calculations...") 7692 7693 # For each operations 7694 for operation_name in operations: 7695 operation_name = operation_name.upper() 7696 if operation_name not in [""]: 7697 if operation_name in operations_config: 7698 log.info(f"Calculation '{operation_name}'") 7699 operation = operations_config[operation_name] 7700 operation_type = operation.get("type", "sql") 7701 if operation_type == "python": 7702 self.calculation_process_function( 7703 operation=operation, operation_name=operation_name 7704 ) 7705 elif operation_type == "sql": 7706 self.calculation_process_sql( 7707 operation=operation, operation_name=operation_name 7708 ) 7709 else: 7710 log.error( 7711 f"Operations config: Type '{operation_type}' NOT available" 7712 ) 7713 raise ValueError( 7714 f"Operations config: Type '{operation_type}' NOT available" 7715 ) 7716 else: 7717 log.error( 7718 f"Operations config: Calculation '{operation_name}' NOT available" 7719 ) 7720 raise ValueError( 7721 f"Operations config: Calculation '{operation_name}' NOT available" 7722 ) 7723 7724 # Explode INFOS fields into table fields 7725 if self.get_explode_infos(): 7726 self.explode_infos( 7727 prefix=self.get_explode_infos_prefix(), 7728 fields=self.get_explode_infos_fields(), 7729 force=True, 7730 ) 7731 7732 def calculation_process_sql( 7733 self, operation: dict, operation_name: str = "unknown" 7734 ) -> None: 7735 """ 7736 The `calculation_process_sql` function takes in a mathematical operation as a string and 7737 performs the operation, updating the specified table with the result. 
    def calculation_process_sql(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_sql` function takes in a mathematical operation as a string and
        performs the operation, updating the specified table with the result.

        :param operation: The `operation` parameter is a dictionary that contains information about
            the operation to be performed. Recognized keys: "name", "output_column_name",
            "output_column_type", "explode_infos_prefix", "output_column_description",
            "operation_query" (str or list of str fragments), "info_fields",
            "info_fields_check", "operation_info"
        :type operation: dict
        :param operation_name: The `operation_name` parameter is a string that represents the name
            of the operation being performed. It is used for logging and error handling purposes,
            defaults to unknown
        :type operation_name: str (optional)
        :raises ValueError: if the query is missing, mandatory info fields are absent, or the
            update query fails
        """

        # table variants
        table_variants = self.get_table_variants(clause="alter")

        # Operation infos
        # NOTE(review): this overwrites the `operation_name` argument with operation["name"]
        # (falling back to "unknown", not to the argument) — confirm intended
        operation_name = operation.get("name", "unknown")
        log.debug(f"process sql {operation_name}")
        output_column_name = operation.get("output_column_name", operation_name)
        output_column_type = operation.get("output_column_type", "String")
        prefix = operation.get("explode_infos_prefix", "")
        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
        output_column_description = operation.get(
            "output_column_description", f"{operation_name} operation"
        )
        operation_query = operation.get("operation_query", None)
        # A query given as a list of fragments is joined into one SQL string
        if isinstance(operation_query, list):
            operation_query = " ".join(operation_query)
        operation_info_fields = operation.get("info_fields", [])
        operation_info_fields_check = operation.get("info_fields_check", False)
        operation_info = operation.get("operation_info", True)

        if operation_query:

            # Info fields check: all required INFO fields must exist in the VCF header
            operation_info_fields_check_result = True
            if operation_info_fields_check:
                header_infos = self.get_header().infos
                for info_field in operation_info_fields:
                    operation_info_fields_check_result = (
                        operation_info_fields_check_result
                        and info_field in header_infos
                    )

            # If info fields available
            if operation_info_fields_check_result:

                # Added_columns (temporary columns, dropped at the end)
                added_columns = []

                # Create VCF header field for the output annotation
                vcf_reader = self.get_header()
                vcf_reader.infos[output_column_name] = vcf.parser._Info(
                    output_column_name,
                    ".",
                    output_column_type,
                    output_column_description,
                    "howard calculation",
                    "0",
                    # NOTE(review): uses `self.code_type_map` while annotation_hgvs uses the
                    # module-level `code_type_map` — confirm both exist and agree
                    self.code_type_map.get(output_column_type),
                )

                # Explode infos if needed (materialize INFO fields as table columns)
                log.debug(f"calculation_process_sql prefix {prefix}")
                added_columns += self.explode_infos(
                    prefix=prefix,
                    fields=[output_column_name] + operation_info_fields,
                    force=True,
                )

                # Create column receiving the calculation result
                added_column = self.add_column(
                    table_name=table_variants,
                    column_name=prefix + output_column_name,
                    column_type=output_column_type_sql,
                    default_value="null",
                )
                added_columns.append(added_column)

                # Operation calculation
                try:

                    # Query to update calculation column
                    sql_update = f"""
                        UPDATE {table_variants}
                        SET "{prefix}{output_column_name}" = ({operation_query})
                    """
                    self.conn.execute(sql_update)

                    # Add to INFO (append ';' separator only when INFO already has content)
                    if operation_info:
                        sql_update_info = f"""
                            UPDATE {table_variants}
                            SET "INFO" =
                                concat(
                                    CASE
                                        WHEN "INFO" IS NOT NULL
                                        THEN concat("INFO", ';')
                                        ELSE ''
                                    END,
                                    '{output_column_name}=',
                                    "{prefix}{output_column_name}"
                                )
                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
                        """
                        self.conn.execute(sql_update_info)

                # NOTE(review): bare `except:` also intercepts KeyboardInterrupt/SystemExit
                # and discards the original cause — consider
                # `except Exception as e: ...; raise ValueError(...) from e`
                except:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' query failed"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' query failed"
                    )

                # Remove added columns
                for added_column in added_columns:
                    log.debug(f"added_column: {added_column}")
                    self.drop_column(column=added_column)

            else:
                log.error(
                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
                )
                raise ValueError(
                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
                )

        else:
            log.error(
                f"Operations config: Calculation '{operation_name}' query NOT defined"
            )
            raise ValueError(
                f"Operations config: Calculation '{operation_name}' query NOT defined"
            )
{operation_info_fields}" 7859 ) 7860 raise ValueError( 7861 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7862 ) 7863 7864 else: 7865 log.error( 7866 f"Operations config: Calculation '{operation_name}' query NOT defined" 7867 ) 7868 raise ValueError( 7869 f"Operations config: Calculation '{operation_name}' query NOT defined" 7870 ) 7871 7872 def calculation_process_function( 7873 self, operation: dict, operation_name: str = "unknown" 7874 ) -> None: 7875 """ 7876 The `calculation_process_function` takes in an operation dictionary and performs the specified 7877 function with the given parameters. 7878 7879 :param operation: The `operation` parameter is a dictionary that contains information about the 7880 operation to be performed. It has the following keys: 7881 :type operation: dict 7882 :param operation_name: The `operation_name` parameter is a string that represents the name of 7883 the operation being performed. It is used for logging purposes, defaults to unknown 7884 :type operation_name: str (optional) 7885 """ 7886 7887 operation_name = operation["name"] 7888 log.debug(f"process sql {operation_name}") 7889 function_name = operation["function_name"] 7890 function_params = operation["function_params"] 7891 getattr(self, function_name)(*function_params) 7892 7893 def calculation_variant_id(self) -> None: 7894 """ 7895 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 7896 updates the INFO field of a variants table with the variant ID. 
7897 """ 7898 7899 # variant_id annotation field 7900 variant_id_tag = self.get_variant_id_column() 7901 added_columns = [variant_id_tag] 7902 7903 # variant_id hgvs tags" 7904 vcf_infos_tags = { 7905 variant_id_tag: "howard variant ID annotation", 7906 } 7907 7908 # Variants table 7909 table_variants = self.get_table_variants() 7910 7911 # Header 7912 vcf_reader = self.get_header() 7913 7914 # Add variant_id to header 7915 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 7916 variant_id_tag, 7917 ".", 7918 "String", 7919 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 7920 "howard calculation", 7921 "0", 7922 self.code_type_map.get("String"), 7923 ) 7924 7925 # Update 7926 sql_update = f""" 7927 UPDATE {table_variants} 7928 SET "INFO" = 7929 concat( 7930 CASE 7931 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 7932 THEN '' 7933 ELSE concat("INFO", ';') 7934 END, 7935 '{variant_id_tag}=', 7936 "{variant_id_tag}" 7937 ) 7938 """ 7939 self.conn.execute(sql_update) 7940 7941 # Remove added columns 7942 for added_column in added_columns: 7943 self.drop_column(column=added_column) 7944 7945 def calculation_extract_snpeff_hgvs( 7946 self, 7947 snpeff_hgvs: str = "snpeff_hgvs", 7948 snpeff_field: str = "ANN", 7949 ) -> None: 7950 """ 7951 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 7952 annotation field in a VCF file and adds them as a new column in the variants table. 7953 7954 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 7955 function is used to specify the name of the column that will store the HGVS nomenclatures 7956 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 7957 snpeff_hgvs 7958 :type snpeff_hgvs: str (optional) 7959 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 7960 function represents the field in the VCF file that contains SnpEff annotations. 
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new INFO tag.

        :param snpeff_hgvs: name of the INFO tag that will store the HGVS nomenclatures
            extracted from the SnpEff annotation field, defaults to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: the INFO field that contains SnpEff annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the SnpEff header description cannot be parsed
        """

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any non-empty configured prefix is replaced by "INFO/" here;
        # looks like the condition may be inverted (`if not prefix`?) — confirm
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns (temporary, dropped at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: snpEff encodes sub-field names as 'A | B | C' inside
            # the quoted part of the INFO Description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alnum-only key, original text kept as the value
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: join the dataframe on variant id and append '<tag>=<value>'
            # to INFO when the extracted value is meaningful
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe (free memory eagerly)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            # TODO: typo "Anotate" in the user-facing warning (runtime string,
            # left unchanged here)
            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the annotation field and updating variant information accordingly.

        :param uniquify: whether duplicate entries should be removed from the exploded
            output, defaults to True
        :type uniquify: bool (optional)
        :param output_format: format of the generated annotations, either "fields"
            (one tag per ANN sub-field) or "JSON" (a single JSON tag), defaults to fields
        :type output_format: str (optional)
        :param output_prefix: prefix added to the generated annotation tags, defaults
            to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: the INFO field that contains SnpEff annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the SnpEff header description cannot be parsed
        """

        # SnpEff annotation field (internal working column name)
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): any non-empty configured prefix is replaced by "INFO/" here;
        # same pattern as calculation_extract_snpeff_hgvs — confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns (temporary, dropped at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header: snpEff encodes sub-field names as 'A | B | C' inside
            # the quoted part of the INFO Description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alnum-only key, original text kept as the value
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header
            # In JSON mode a single tag is emitted; in fields mode one header entry per
            # ANN sub-field is declared and the exploded string is appended as-is
            # (presumably it already contains "key=value" pairs — confirm against
            # explode_snpeff_ann)
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                # NOTE(review): the JSON header entry is registered under the raw
                # output_prefix (e.g. "snpeff_") as its ID — confirm intended
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Update: join the dataframe on variant id and append the exploded
            # annotations to INFO when the value is meaningful
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe (free memory eagerly)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            # TODO: typo "Anotate" in the user-facing warning (runtime string,
            # left unchanged here)
            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
for CNOMEN and PNOMEN)", 8275 "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)", 8276 "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)", 8277 } 8278 8279 # Param 8280 param = self.get_param() 8281 8282 # Prefix 8283 prefix = self.get_explode_infos_prefix() 8284 8285 # Header 8286 vcf_reader = self.get_header() 8287 8288 # Get HGVS field 8289 hgvs_field = ( 8290 param.get("calculation", {}) 8291 .get("calculations", {}) 8292 .get("NOMEN", {}) 8293 .get("options", {}) 8294 .get("hgvs_field", "hgvs") 8295 ) 8296 8297 # Get transcripts 8298 transcripts_file = ( 8299 param.get("calculation", {}) 8300 .get("calculations", {}) 8301 .get("NOMEN", {}) 8302 .get("options", {}) 8303 .get("transcripts", None) 8304 ) 8305 transcripts_file = full_path(transcripts_file) 8306 transcripts = [] 8307 if transcripts_file: 8308 if os.path.exists(transcripts_file): 8309 transcripts_dataframe = transcripts_file_to_df(transcripts_file) 8310 transcripts = transcripts_dataframe.iloc[:, 0].tolist() 8311 else: 8312 log.error(f"Transcript file '{transcripts_file}' does NOT exist") 8313 raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist") 8314 8315 # Added columns 8316 added_columns = [] 8317 8318 # Explode HGVS field in column 8319 added_columns += self.explode_infos(fields=[hgvs_field]) 8320 8321 # extra infos 8322 extra_infos = self.get_extra_infos() 8323 extra_field = prefix + hgvs_field 8324 8325 if extra_field in extra_infos: 8326 8327 # Create dataframe 8328 dataframe_hgvs = self.get_query_to_df( 8329 f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """ 8330 ) 8331 8332 # Create main NOMEN column 8333 dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply( 8334 lambda x: find_nomen(str(x), transcripts=transcripts) 8335 ) 8336 8337 # Explode NOMEN Structure and create SQL set for update 8338 sql_nomen_fields = [] 8339 for nomen_field in nomen_dict: 8340 8341 # Explode each field 
into a column 8342 dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply( 8343 lambda x: dict(x).get(nomen_field, "") 8344 ) 8345 8346 # Create VCF header field 8347 vcf_reader.infos[nomen_field] = vcf.parser._Info( 8348 nomen_field, 8349 ".", 8350 "String", 8351 nomen_dict.get(nomen_field, "howard calculation NOMEN"), 8352 "howard calculation", 8353 "0", 8354 self.code_type_map.get("String"), 8355 ) 8356 sql_nomen_fields.append( 8357 f""" 8358 CASE 8359 WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('') 8360 THEN concat( 8361 ';{nomen_field}=', 8362 dataframe_hgvs."{nomen_field}" 8363 ) 8364 ELSE '' 8365 END 8366 """ 8367 ) 8368 8369 # SQL set for update 8370 sql_nomen_fields_set = ", ".join(sql_nomen_fields) 8371 8372 # Update 8373 sql_update = f""" 8374 UPDATE variants 8375 SET "INFO" = 8376 concat( 8377 CASE 8378 WHEN "INFO" IS NULL 8379 THEN '' 8380 ELSE "INFO" 8381 END, 8382 {sql_nomen_fields_set} 8383 ) 8384 FROM dataframe_hgvs 8385 WHERE variants."#CHROM" = dataframe_hgvs."#CHROM" 8386 AND variants."POS" = dataframe_hgvs."POS" 8387 AND variants."REF" = dataframe_hgvs."REF" 8388 AND variants."ALT" = dataframe_hgvs."ALT" 8389 """ 8390 self.conn.execute(sql_update) 8391 8392 # Delete dataframe 8393 del dataframe_hgvs 8394 gc.collect() 8395 8396 # Remove added columns 8397 for added_column in added_columns: 8398 self.drop_column(column=added_column) 8399 8400 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 8401 """ 8402 The function `calculation_find_by_pipeline` performs a calculation to find the number of 8403 pipeline/sample for a variant and updates the variant information in a VCF file. 8404 8405 :param tag: The `tag` parameter is a string that represents the annotation field for the 8406 "findbypipeline" information in the VCF file. 
It is used to create the annotation field in the 8407 VCF header and to update the corresponding field in the variants table, defaults to 8408 findbypipeline 8409 :type tag: str (optional) 8410 """ 8411 8412 # if FORMAT and samples 8413 if ( 8414 "FORMAT" in self.get_header_columns_as_list() 8415 and self.get_header_sample_list() 8416 ): 8417 8418 # findbypipeline annotation field 8419 findbypipeline_tag = tag 8420 8421 # VCF infos tags 8422 vcf_infos_tags = { 8423 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 8424 } 8425 8426 # Prefix 8427 prefix = self.get_explode_infos_prefix() 8428 8429 # Field 8430 findbypipeline_infos = prefix + findbypipeline_tag 8431 8432 # Variants table 8433 table_variants = self.get_table_variants() 8434 8435 # Header 8436 vcf_reader = self.get_header() 8437 8438 # Create variant id 8439 variant_id_column = self.get_variant_id_column() 8440 added_columns = [variant_id_column] 8441 8442 # variant_id, FORMAT and samples 8443 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8444 self.get_header_sample_list() 8445 ) 8446 8447 # Create dataframe 8448 dataframe_findbypipeline = self.get_query_to_df( 8449 f""" SELECT {samples_fields} FROM {table_variants} """ 8450 ) 8451 8452 # Create findbypipeline column 8453 dataframe_findbypipeline[findbypipeline_infos] = ( 8454 dataframe_findbypipeline.apply( 8455 lambda row: findbypipeline( 8456 row, samples=self.get_header_sample_list() 8457 ), 8458 axis=1, 8459 ) 8460 ) 8461 8462 # Add snpeff_hgvs to header 8463 vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 8464 findbypipeline_tag, 8465 ".", 8466 "String", 8467 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 8468 "howard calculation", 8469 "0", 8470 self.code_type_map.get("String"), 8471 ) 8472 8473 # Update 8474 sql_update = f""" 8475 UPDATE variants 8476 SET "INFO" = 8477 concat( 8478 CASE 8479 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8480 THEN '' 8481 ELSE 
concat("INFO", ';') 8482 END, 8483 CASE 8484 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 8485 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 8486 THEN concat( 8487 '{findbypipeline_tag}=', 8488 dataframe_findbypipeline."{findbypipeline_infos}" 8489 ) 8490 ELSE '' 8491 END 8492 ) 8493 FROM dataframe_findbypipeline 8494 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 8495 """ 8496 self.conn.execute(sql_update) 8497 8498 # Remove added columns 8499 for added_column in added_columns: 8500 self.drop_column(column=added_column) 8501 8502 # Delete dataframe 8503 del dataframe_findbypipeline 8504 gc.collect() 8505 8506 def calculation_genotype_concordance(self) -> None: 8507 """ 8508 The function `calculation_genotype_concordance` calculates the genotype concordance for 8509 multi-caller VCF files and updates the variant information in the database. 8510 """ 8511 8512 # if FORMAT and samples 8513 if ( 8514 "FORMAT" in self.get_header_columns_as_list() 8515 and self.get_header_sample_list() 8516 ): 8517 8518 # genotypeconcordance annotation field 8519 genotypeconcordance_tag = "genotypeconcordance" 8520 8521 # VCF infos tags 8522 vcf_infos_tags = { 8523 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 8524 } 8525 8526 # Prefix 8527 prefix = self.get_explode_infos_prefix() 8528 8529 # Field 8530 genotypeconcordance_infos = prefix + genotypeconcordance_tag 8531 8532 # Variants table 8533 table_variants = self.get_table_variants() 8534 8535 # Header 8536 vcf_reader = self.get_header() 8537 8538 # Create variant id 8539 variant_id_column = self.get_variant_id_column() 8540 added_columns = [variant_id_column] 8541 8542 # variant_id, FORMAT and samples 8543 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8544 self.get_header_sample_list() 8545 ) 8546 8547 # Create dataframe 8548 dataframe_genotypeconcordance = self.get_query_to_df( 8549 f""" SELECT 
{samples_fields} FROM {table_variants} """ 8550 ) 8551 8552 # Create genotypeconcordance column 8553 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 8554 dataframe_genotypeconcordance.apply( 8555 lambda row: genotypeconcordance( 8556 row, samples=self.get_header_sample_list() 8557 ), 8558 axis=1, 8559 ) 8560 ) 8561 8562 # Add genotypeconcordance to header 8563 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 8564 genotypeconcordance_tag, 8565 ".", 8566 "String", 8567 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 8568 "howard calculation", 8569 "0", 8570 self.code_type_map.get("String"), 8571 ) 8572 8573 # Update 8574 sql_update = f""" 8575 UPDATE variants 8576 SET "INFO" = 8577 concat( 8578 CASE 8579 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8580 THEN '' 8581 ELSE concat("INFO", ';') 8582 END, 8583 CASE 8584 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 8585 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 8586 THEN concat( 8587 '{genotypeconcordance_tag}=', 8588 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 8589 ) 8590 ELSE '' 8591 END 8592 ) 8593 FROM dataframe_genotypeconcordance 8594 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 8595 """ 8596 self.conn.execute(sql_update) 8597 8598 # Remove added columns 8599 for added_column in added_columns: 8600 self.drop_column(column=added_column) 8601 8602 # Delete dataframe 8603 del dataframe_genotypeconcordance 8604 gc.collect() 8605 8606 def calculation_barcode(self, tag: str = "barcode") -> None: 8607 """ 8608 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 8609 updates the INFO field in the file with the calculated barcode values. 8610 8611 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 8612 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 8613 the default tag name is set to "barcode", defaults to barcode 8614 :type tag: str (optional) 8615 """ 8616 8617 # if FORMAT and samples 8618 if ( 8619 "FORMAT" in self.get_header_columns_as_list() 8620 and self.get_header_sample_list() 8621 ): 8622 8623 # barcode annotation field 8624 if not tag: 8625 tag = "barcode" 8626 8627 # VCF infos tags 8628 vcf_infos_tags = { 8629 tag: "barcode calculation (VaRank)", 8630 } 8631 8632 # Prefix 8633 prefix = self.get_explode_infos_prefix() 8634 8635 # Field 8636 barcode_infos = prefix + tag 8637 8638 # Variants table 8639 table_variants = self.get_table_variants() 8640 8641 # Header 8642 vcf_reader = self.get_header() 8643 8644 # Create variant id 8645 variant_id_column = self.get_variant_id_column() 8646 added_columns = [variant_id_column] 8647 8648 # variant_id, FORMAT and samples 8649 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8650 self.get_header_sample_list() 8651 ) 8652 8653 # Create dataframe 8654 dataframe_barcode = self.get_query_to_df( 8655 f""" SELECT {samples_fields} FROM {table_variants} """ 8656 ) 8657 8658 # Create barcode column 8659 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8660 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 8661 ) 8662 8663 # Add barcode to header 8664 vcf_reader.infos[tag] = vcf.parser._Info( 8665 tag, 8666 ".", 8667 "String", 8668 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 8669 "howard calculation", 8670 "0", 8671 self.code_type_map.get("String"), 8672 ) 8673 8674 # Update 8675 sql_update = f""" 8676 UPDATE {table_variants} 8677 SET "INFO" = 8678 concat( 8679 CASE 8680 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8681 THEN '' 8682 ELSE concat("INFO", ';') 8683 END, 8684 CASE 8685 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 8686 AND dataframe_barcode."{barcode_infos}" NOT NULL 8687 THEN concat( 8688 '{tag}=', 8689 dataframe_barcode."{barcode_infos}" 8690 ) 8691 ELSE '' 8692 
END 8693 ) 8694 FROM dataframe_barcode 8695 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 8696 """ 8697 self.conn.execute(sql_update) 8698 8699 # Remove added columns 8700 for added_column in added_columns: 8701 self.drop_column(column=added_column) 8702 8703 # Delete dataframe 8704 del dataframe_barcode 8705 gc.collect() 8706 8707 def calculation_barcode_family(self, tag: str = "BCF") -> None: 8708 """ 8709 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 8710 and updates the INFO field in the file with the calculated barcode values. 8711 8712 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 8713 the barcode tag that will be added to the VCF file during the calculation process. If no value 8714 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 8715 :type tag: str (optional) 8716 """ 8717 8718 # if FORMAT and samples 8719 if ( 8720 "FORMAT" in self.get_header_columns_as_list() 8721 and self.get_header_sample_list() 8722 ): 8723 8724 # barcode annotation field 8725 if not tag: 8726 tag = "BCF" 8727 8728 # VCF infos tags 8729 vcf_infos_tags = { 8730 tag: "barcode family calculation", 8731 f"{tag}S": "barcode family samples", 8732 } 8733 8734 # Param 8735 param = self.get_param() 8736 log.debug(f"param={param}") 8737 8738 # Prefix 8739 prefix = self.get_explode_infos_prefix() 8740 8741 # PED param 8742 ped = ( 8743 param.get("calculation", {}) 8744 .get("calculations", {}) 8745 .get("BARCODEFAMILY", {}) 8746 .get("family_pedigree", None) 8747 ) 8748 log.debug(f"ped={ped}") 8749 8750 # Load PED 8751 if ped: 8752 8753 # Pedigree is a file 8754 if isinstance(ped, str) and os.path.exists(full_path(ped)): 8755 log.debug("Pedigree is file") 8756 with open(full_path(ped)) as ped: 8757 ped = json.load(ped) 8758 8759 # Pedigree is a string 8760 elif isinstance(ped, str): 8761 log.debug("Pedigree is str") 8762 
try: 8763 ped = json.loads(ped) 8764 log.debug("Pedigree is json str") 8765 except ValueError as e: 8766 ped_samples = ped.split(",") 8767 ped = {} 8768 for ped_sample in ped_samples: 8769 ped[ped_sample] = ped_sample 8770 8771 # Pedigree is a dict 8772 elif isinstance(ped, dict): 8773 log.debug("Pedigree is dict") 8774 8775 # Pedigree is not well formatted 8776 else: 8777 msg_error = "Pedigree not well formatted" 8778 log.error(msg_error) 8779 raise ValueError(msg_error) 8780 8781 # Construct list 8782 ped_samples = list(ped.values()) 8783 8784 else: 8785 log.debug("Pedigree not defined. Take all samples") 8786 ped_samples = self.get_header_sample_list() 8787 ped = {} 8788 for ped_sample in ped_samples: 8789 ped[ped_sample] = ped_sample 8790 8791 # Check pedigree 8792 if not ped or len(ped) == 0: 8793 msg_error = f"Error in pedigree: samples {ped_samples}" 8794 log.error(msg_error) 8795 raise ValueError(msg_error) 8796 8797 # Log 8798 log.info( 8799 "Calculation 'BARCODEFAMILY' - Samples: " 8800 + ", ".join([f"{member}='{ped[member]}'" for member in ped]) 8801 ) 8802 log.debug(f"ped_samples={ped_samples}") 8803 8804 # Field 8805 barcode_infos = prefix + tag 8806 8807 # Variants table 8808 table_variants = self.get_table_variants() 8809 8810 # Header 8811 vcf_reader = self.get_header() 8812 8813 # Create variant id 8814 variant_id_column = self.get_variant_id_column() 8815 added_columns = [variant_id_column] 8816 8817 # variant_id, FORMAT and samples 8818 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8819 ped_samples 8820 ) 8821 8822 # Create dataframe 8823 dataframe_barcode = self.get_query_to_df( 8824 f""" SELECT {samples_fields} FROM {table_variants} """ 8825 ) 8826 8827 # Create barcode column 8828 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8829 lambda row: barcode(row, samples=ped_samples), axis=1 8830 ) 8831 8832 # Add barcode family to header 8833 # Add vaf_normalization to header 8834 vcf_reader.formats[tag] = 
vcf.parser._Format( 8835 id=tag, 8836 num=".", 8837 type="String", 8838 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 8839 type_code=self.code_type_map.get("String"), 8840 ) 8841 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 8842 id=f"{tag}S", 8843 num=".", 8844 type="String", 8845 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 8846 type_code=self.code_type_map.get("String"), 8847 ) 8848 8849 # Update 8850 # for sample in ped_samples: 8851 sql_update_set = [] 8852 for sample in self.get_header_sample_list() + ["FORMAT"]: 8853 if sample in ped_samples: 8854 value = f'dataframe_barcode."{barcode_infos}"' 8855 value_samples = "'" + ",".join(ped_samples) + "'" 8856 elif sample == "FORMAT": 8857 value = f"'{tag}'" 8858 value_samples = f"'{tag}S'" 8859 else: 8860 value = "'.'" 8861 value_samples = "'.'" 8862 format_regex = r"[a-zA-Z0-9\s]" 8863 sql_update_set.append( 8864 f""" 8865 "{sample}" = 8866 concat( 8867 CASE 8868 WHEN {table_variants}."{sample}" = './.' 8869 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 8870 ELSE {table_variants}."{sample}" 8871 END, 8872 ':', 8873 {value}, 8874 ':', 8875 {value_samples} 8876 ) 8877 """ 8878 ) 8879 8880 sql_update_set_join = ", ".join(sql_update_set) 8881 sql_update = f""" 8882 UPDATE {table_variants} 8883 SET {sql_update_set_join} 8884 FROM dataframe_barcode 8885 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 8886 """ 8887 self.conn.execute(sql_update) 8888 8889 # Remove added columns 8890 for added_column in added_columns: 8891 self.drop_column(column=added_column) 8892 8893 # Delete dataframe 8894 del dataframe_barcode 8895 gc.collect() 8896 8897 def calculation_trio(self) -> None: 8898 """ 8899 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 8900 information to the INFO field of each variant. 
8901 """ 8902 8903 # if FORMAT and samples 8904 if ( 8905 "FORMAT" in self.get_header_columns_as_list() 8906 and self.get_header_sample_list() 8907 ): 8908 8909 # trio annotation field 8910 trio_tag = "trio" 8911 8912 # VCF infos tags 8913 vcf_infos_tags = { 8914 "trio": "trio calculation", 8915 } 8916 8917 # Param 8918 param = self.get_param() 8919 8920 # Prefix 8921 prefix = self.get_explode_infos_prefix() 8922 8923 # Trio param 8924 trio_ped = ( 8925 param.get("calculation", {}) 8926 .get("calculations", {}) 8927 .get("TRIO", {}) 8928 .get("trio_pedigree", None) 8929 ) 8930 8931 # Load trio 8932 if trio_ped: 8933 8934 # Trio pedigree is a file 8935 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 8936 log.debug("TRIO pedigree is file") 8937 with open(full_path(trio_ped)) as trio_ped: 8938 trio_ped = json.load(trio_ped) 8939 8940 # Trio pedigree is a string 8941 elif isinstance(trio_ped, str): 8942 log.debug("TRIO pedigree is str") 8943 try: 8944 trio_ped = json.loads(trio_ped) 8945 log.debug("TRIO pedigree is json str") 8946 except ValueError as e: 8947 trio_samples = trio_ped.split(",") 8948 if len(trio_samples) == 3: 8949 trio_ped = { 8950 "father": trio_samples[0], 8951 "mother": trio_samples[1], 8952 "child": trio_samples[2], 8953 } 8954 log.debug("TRIO pedigree is list str") 8955 else: 8956 msg_error = "TRIO pedigree not well formatted" 8957 log.error(msg_error) 8958 raise ValueError(msg_error) 8959 8960 # Trio pedigree is a dict 8961 elif isinstance(trio_ped, dict): 8962 log.debug("TRIO pedigree is dict") 8963 8964 # Trio pedigree is not well formatted 8965 else: 8966 msg_error = "TRIO pedigree not well formatted" 8967 log.error(msg_error) 8968 raise ValueError(msg_error) 8969 8970 # Construct trio list 8971 trio_samples = [ 8972 trio_ped.get("father", ""), 8973 trio_ped.get("mother", ""), 8974 trio_ped.get("child", ""), 8975 ] 8976 8977 else: 8978 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 8979 samples_list = self.get_header_sample_list() 8980 if len(samples_list) >= 3: 8981 trio_samples = self.get_header_sample_list()[0:3] 8982 trio_ped = { 8983 "father": trio_samples[0], 8984 "mother": trio_samples[1], 8985 "child": trio_samples[2], 8986 } 8987 else: 8988 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 8989 log.error(msg_error) 8990 raise ValueError(msg_error) 8991 8992 # Check trio pedigree 8993 if not trio_ped or len(trio_ped) != 3: 8994 msg_error = f"Error in TRIO pedigree: {trio_ped}" 8995 log.error(msg_error) 8996 raise ValueError(msg_error) 8997 8998 # Log 8999 log.info( 9000 f"Calculation 'TRIO' - Samples: " 9001 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9002 ) 9003 9004 # Field 9005 trio_infos = prefix + trio_tag 9006 9007 # Variants table 9008 table_variants = self.get_table_variants() 9009 9010 # Header 9011 vcf_reader = self.get_header() 9012 9013 # Create variant id 9014 variant_id_column = self.get_variant_id_column() 9015 added_columns = [variant_id_column] 9016 9017 # variant_id, FORMAT and samples 9018 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9019 self.get_header_sample_list() 9020 ) 9021 9022 # Create dataframe 9023 dataframe_trio = self.get_query_to_df( 9024 f""" SELECT {samples_fields} FROM {table_variants} """ 9025 ) 9026 9027 # Create trio column 9028 dataframe_trio[trio_infos] = dataframe_trio.apply( 9029 lambda row: trio(row, samples=trio_samples), axis=1 9030 ) 9031 9032 # Add trio to header 9033 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9034 trio_tag, 9035 ".", 9036 "String", 9037 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9038 "howard calculation", 9039 "0", 9040 self.code_type_map.get("String"), 9041 ) 9042 9043 # Update 9044 sql_update = f""" 9045 UPDATE {table_variants} 9046 SET "INFO" = 9047 concat( 9048 CASE 9049 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9050 THEN '' 9051 ELSE 
concat("INFO", ';') 9052 END, 9053 CASE 9054 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9055 AND dataframe_trio."{trio_infos}" NOT NULL 9056 THEN concat( 9057 '{trio_tag}=', 9058 dataframe_trio."{trio_infos}" 9059 ) 9060 ELSE '' 9061 END 9062 ) 9063 FROM dataframe_trio 9064 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9065 """ 9066 self.conn.execute(sql_update) 9067 9068 # Remove added columns 9069 for added_column in added_columns: 9070 self.drop_column(column=added_column) 9071 9072 # Delete dataframe 9073 del dataframe_trio 9074 gc.collect() 9075 9076 def calculation_vaf_normalization(self) -> None: 9077 """ 9078 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9079 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9080 :return: The function does not return anything. 9081 """ 9082 9083 # if FORMAT and samples 9084 if ( 9085 "FORMAT" in self.get_header_columns_as_list() 9086 and self.get_header_sample_list() 9087 ): 9088 9089 # vaf_normalization annotation field 9090 vaf_normalization_tag = "VAF" 9091 9092 # VCF infos tags 9093 vcf_infos_tags = { 9094 "VAF": "VAF Variant Frequency", 9095 } 9096 9097 # Prefix 9098 prefix = self.get_explode_infos_prefix() 9099 9100 # Variants table 9101 table_variants = self.get_table_variants() 9102 9103 # Header 9104 vcf_reader = self.get_header() 9105 9106 # Do not calculate if VAF already exists 9107 if "VAF" in vcf_reader.formats: 9108 log.debug("VAF already on genotypes") 9109 return 9110 9111 # Create variant id 9112 variant_id_column = self.get_variant_id_column() 9113 added_columns = [variant_id_column] 9114 9115 # variant_id, FORMAT and samples 9116 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9117 f""" "{sample}" """ for sample in self.get_header_sample_list() 9118 ) 9119 9120 # Create dataframe 9121 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
FROM {table_variants} """ 9122 log.debug(f"query={query}") 9123 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9124 9125 vaf_normalization_set = [] 9126 9127 # for each sample vaf_normalization 9128 for sample in self.get_header_sample_list(): 9129 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9130 lambda row: vaf_normalization(row, sample=sample), axis=1 9131 ) 9132 vaf_normalization_set.append( 9133 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9134 ) 9135 9136 # Add VAF to FORMAT 9137 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 9138 "FORMAT" 9139 ].apply(lambda x: str(x) + ":VAF") 9140 vaf_normalization_set.append( 9141 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9142 ) 9143 9144 # Add vaf_normalization to header 9145 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9146 id=vaf_normalization_tag, 9147 num="1", 9148 type="Float", 9149 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9150 type_code=self.code_type_map.get("Float"), 9151 ) 9152 9153 # Create fields to add in INFO 9154 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9155 9156 # Update 9157 sql_update = f""" 9158 UPDATE {table_variants} 9159 SET {sql_vaf_normalization_set} 9160 FROM dataframe_vaf_normalization 9161 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9162 9163 """ 9164 self.conn.execute(sql_update) 9165 9166 # Remove added columns 9167 for added_column in added_columns: 9168 self.drop_column(column=added_column) 9169 9170 # Delete dataframe 9171 del dataframe_vaf_normalization 9172 gc.collect() 9173 9174 def calculation_genotype_stats(self, info: str = "VAF") -> None: 9175 """ 9176 The `calculation_genotype_stats` function calculates genotype statistics for a given information 9177 field in a VCF file and updates the INFO column of the variants table with the calculated 9178 statistics. 

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when genotypes are present (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags
            # NOTE(review): '_stats_mediane' spelling is part of the public tag name —
            # do not 'fix' it without a migration for downstream consumers
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column (dict of all stats per variant)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract stats
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add stat tag to header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First field gets no ';' prefix, subsequent fields do
                # NOTE(review): if the first stat is NULL but a later one is not, the
                # appended INFO starts with ';' right after the separator already added
                # by the INFO CASE below, yielding a double ';' — confirm intended
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()

    def calculation_transcripts_annotation(
        self, info_json: str = None, info_format: str = None
    ) -> None:
        """
        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
        field to it if transcripts are available.

        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
        is a string parameter that represents the information field to be used in the transcripts JSON.
        It is used to specify the JSON format for the transcripts information
        :type info_json: str
        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
        method is a string parameter that specifies the format of the information field to be used in
        the transcripts JSON. It is used to define the format of the information field
        :type info_format: str
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Add info field
        if transcripts_table:
            self.transcript_view_to_variants(
                transcripts_table=transcripts_table,
                transcripts_info_field_json=info_json,
                transcripts_info_field_format=info_format,
            )
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    def calculation_transcripts_prioritization(self) -> None:
        """
        The function `calculation_transcripts_prioritization` creates a transcripts table and
        prioritizes transcripts based on certain criteria.
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Add info field
        if transcripts_table:
            self.transcripts_prioritization(transcripts_table=transcripts_table)
        else:
            log.info("No Transcripts to process.

    ###############
    # Transcripts #
    ###############

    # NOTE(review): `param: dict = {}` below is a mutable default argument; safe only
    # if never mutated — consider defaulting to None (the body already does `if not param`)
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
        and updates the variants table with the prioritized information.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
        This parameter is used to identify the table where the transcripts data is stored for the
        prioritization process
        :type transcripts_table: str
        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
        that contains various configuration settings for the prioritization process of transcripts. It
        is used to customize the behavior of the prioritization algorithm and includes settings such as
        the prefix for prioritization fields, default profiles, and other
        :type param: dict
        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
        transcripts prioritization process is successfully completed, and `False` if there are any
        issues or if no profile is defined for transcripts prioritization.
9382 """ 9383 9384 log.debug("Start transcripts prioritization...") 9385 9386 # Param 9387 if not param: 9388 param = self.get_param() 9389 9390 # Variants table 9391 table_variants = self.get_table_variants() 9392 log.debug(f"transcripts_table={transcripts_table}") 9393 # Transcripts table 9394 if transcripts_table is None: 9395 log.debug(f"transcripts_table={transcripts_table}") 9396 transcripts_table = self.create_transcript_view( 9397 transcripts_table="transcripts", param=param 9398 ) 9399 log.debug(f"transcripts_table={transcripts_table}") 9400 if transcripts_table is None: 9401 msg_err = "No Transcripts table availalble" 9402 log.error(msg_err) 9403 raise ValueError(msg_err) 9404 9405 # Get transcripts columns 9406 columns_as_list_query = f""" 9407 DESCRIBE {transcripts_table} 9408 """ 9409 columns_as_list = list( 9410 self.get_query_to_df(columns_as_list_query)["column_name"] 9411 ) 9412 9413 # Create INFO if not exists 9414 if "INFO" not in columns_as_list: 9415 query_add_info = f""" 9416 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 9417 """ 9418 self.execute_query(query_add_info) 9419 9420 # Prioritization param and Force only PZ Score and Flag 9421 pz_param = param.get("transcripts", {}).get("prioritization", {}) 9422 pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score" 9423 pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag" 9424 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 9425 pz_param["pzfields"] = [pz_fields_score, pz_fields_flag] 9426 pz_profile_default = ( 9427 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 9428 ) 9429 9430 # Exit if no profile 9431 if pz_profile_default is None: 9432 log.warning("No profile defined for transcripts prioritization") 9433 return False 9434 9435 # Prioritization 9436 prioritization_result = self.prioritization( 9437 table=transcripts_table, 9438 pz_param=param.get("transcripts", {}).get("prioritization", {}), 9439 ) 9440 if not 
prioritization_result: 9441 log.warning("Transcripts prioritization not processed") 9442 return False 9443 9444 # Explode PZ fields 9445 self.explode_infos( 9446 table=transcripts_table, 9447 fields=param.get("transcripts", {}) 9448 .get("prioritization", {}) 9449 .get("pzfields", []), 9450 ) 9451 9452 # Export Transcripts prioritization infos to variants table 9453 query_update = f""" 9454 WITH RankedTranscripts AS ( 9455 SELECT 9456 "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag}, 9457 ROW_NUMBER() OVER ( 9458 PARTITION BY "#CHROM", POS, REF, ALT 9459 ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC 9460 ) AS rn 9461 FROM 9462 {transcripts_table} 9463 ) 9464 UPDATE {table_variants} 9465 SET 9466 INFO = CONCAT(CASE 9467 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9468 THEN '' 9469 ELSE concat("INFO", ';') 9470 END, 9471 concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag}) 9472 ) 9473 FROM 9474 RankedTranscripts 9475 WHERE 9476 rn = 1 9477 AND variants."#CHROM" = RankedTranscripts."#CHROM" 9478 AND variants."POS" = RankedTranscripts."POS" 9479 AND variants."REF" = RankedTranscripts."REF" 9480 AND variants."ALT" = RankedTranscripts."ALT" 9481 9482 """ 9483 self.execute_query(query=query_update) 9484 9485 # Add PZ Transcript in header 9486 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 9487 pz_fields_transcripts, 9488 ".", 9489 "String", 9490 f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}", 9491 "unknown", 9492 "unknown", 9493 code_type_map["String"], 9494 ) 9495 9496 # Return 9497 return True 9498 9499 def create_transcript_view_from_columns_map( 9500 self, 9501 transcripts_table: str = "transcripts", 9502 columns_maps: dict = {}, 9503 added_columns: list = [], 9504 temporary_tables: list = None, 9505 annotation_fields: list = None, 9506 ) -> tuple[list, list, list]: 9507 """ 9508 The 
`create_transcript_view_from_columns_map` function generates a temporary table view based on 9509 specified columns mapping for transcripts data. 9510 9511 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9512 the table where the transcripts data is stored or will be stored in the database. This table 9513 typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, 9514 predictions, etc. It defaults to "transcripts, defaults to transcripts 9515 :type transcripts_table: str (optional) 9516 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about 9517 how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list 9518 represents a mapping configuration for a specific set of columns. It typically includes details such 9519 as the main transcript column and additional information columns 9520 :type columns_maps: dict 9521 :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` 9522 function is a list that stores the additional columns that will be added to the view being created 9523 based on the columns map provided. These columns are generated by exploding the transcript 9524 information columns along with the main transcript column 9525 :type added_columns: list 9526 :param temporary_tables: The `temporary_tables` parameter in the 9527 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 9528 tables created during the process of creating a transcript view from a columns map. 
These temporary 9529 tables are used to store intermediate results or transformations before the final view is generated 9530 :type temporary_tables: list 9531 :param annotation_fields: The `annotation_fields` parameter in the 9532 `create_transcript_view_from_columns_map` function is a list that stores the fields that are used 9533 for annotation in the query view creation process. These fields are extracted from the 9534 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 9535 :type annotation_fields: list 9536 :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three 9537 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 9538 """ 9539 9540 log.debug("Start transcrpts view creation from columns map...") 9541 9542 # "from_columns_map": [ 9543 # { 9544 # "transcripts_column": "Ensembl_transcriptid", 9545 # "transcripts_infos_columns": [ 9546 # "genename", 9547 # "Ensembl_geneid", 9548 # "LIST_S2_score", 9549 # "LIST_S2_pred", 9550 # ], 9551 # }, 9552 # { 9553 # "transcripts_column": "Ensembl_transcriptid", 9554 # "transcripts_infos_columns": [ 9555 # "genename", 9556 # "VARITY_R_score", 9557 # "Aloft_pred", 9558 # ], 9559 # }, 9560 # ], 9561 9562 # Init 9563 if temporary_tables is None: 9564 temporary_tables = [] 9565 if annotation_fields is None: 9566 annotation_fields = [] 9567 9568 # Variants table 9569 table_variants = self.get_table_variants() 9570 9571 for columns_map in columns_maps: 9572 9573 # Transcript column 9574 transcripts_column = columns_map.get("transcripts_column", None) 9575 9576 # Transcripts infos columns 9577 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 9578 9579 if transcripts_column is not None: 9580 9581 # Explode 9582 added_columns += self.explode_infos( 9583 fields=[transcripts_column] + transcripts_infos_columns 9584 ) 9585 9586 # View clauses 9587 clause_select = [] 9588 for field in [transcripts_column] + 
                transcripts_infos_columns:
                    clause_select.append(
                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                    )
                    # The main transcript column is not an annotation field
                    if field not in [transcripts_column]:
                        annotation_fields.append(field)

                # Query view
                # NOTE(review): clause_select is applied twice (inner and outer
                # SELECT), so regexp_split_to_table runs again on already-split
                # values — confirm the outer split is intended
                query = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT,
                        "{transcripts_column}" AS 'transcript',
                        {", ".join(clause_select)}
                    FROM (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            {", ".join(clause_select)}
                        FROM {table_variants}
                        )
                    WHERE "{transcripts_column}" IS NOT NULL
                """

                # Create temporary table name with a random suffix to avoid clashes
                temporary_table = transcripts_table + "".join(
                    random.choices(string.ascii_uppercase + string.digits, k=10)
                )

                # Temporary_tables
                temporary_tables.append(temporary_table)
                query_view = f"""
                    CREATE TEMPORARY TABLE {temporary_table}
                    AS ({query})
                """
                self.execute_query(query=query_view)

        return added_columns, temporary_tables, annotation_fields

    def create_transcript_view_from_column_format(
        self,
        transcripts_table: str = "transcripts",
        column_formats: dict = {},
        temporary_tables: list = None,
        annotation_fields: list = None,
    ) -> tuple[list, list, list]:
        """
        The `create_transcript_view_from_column_format` function generates a transcript view based on
        specified column formats, adds additional columns and annotation fields, and returns the list of
        temporary tables and annotation fields.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
        the table containing the transcripts data. This table will be used as the base table for creating
        the transcript view. The default value for this parameter is "transcripts", but you can provide a
        different table name if needed, defaults to transcripts
        :type transcripts_table: str (optional)
        :param column_formats: The `column_formats` parameter is a dictionary that contains information
        about the columns to be used for creating the transcript view. Each entry in the dictionary
        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
        the provided code snippet:
        :type column_formats: dict
        :param temporary_tables: The `temporary_tables` parameter in the
        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
        views created during the process of creating a transcript view from a column format. These temporary
        views are used to manipulate and extract data before generating the final transcript view. It
        :type temporary_tables: list
        :param annotation_fields: The `annotation_fields` parameter in the
        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
        that are extracted from the temporary views created during the process. These annotation fields are
        obtained by querying the temporary views and extracting the column names excluding specific columns
        like `#CHROM`
        :type annotation_fields: list
        :return: The `create_transcript_view_from_column_format` function returns two lists:
        `temporary_tables` and `annotation_fields`.
9660 """ 9661 9662 log.debug("Start transcrpts view creation from column format...") 9663 9664 # "from_column_format": [ 9665 # { 9666 # "transcripts_column": "ANN", 9667 # "transcripts_infos_column": "Feature_ID", 9668 # } 9669 # ], 9670 9671 # Init 9672 if temporary_tables is None: 9673 temporary_tables = [] 9674 if annotation_fields is None: 9675 annotation_fields = [] 9676 9677 for column_format in column_formats: 9678 9679 # annotation field and transcript annotation field 9680 annotation_field = column_format.get("transcripts_column", "ANN") 9681 transcript_annotation = column_format.get( 9682 "transcripts_infos_column", "Feature_ID" 9683 ) 9684 9685 # Temporary View name 9686 temporary_view_name = transcripts_table + "".join( 9687 random.choices(string.ascii_uppercase + string.digits, k=10) 9688 ) 9689 9690 # Create temporary view name 9691 temporary_view_name = self.annotation_format_to_table( 9692 uniquify=True, 9693 annotation_field=annotation_field, 9694 view_name=temporary_view_name, 9695 annotation_id=transcript_annotation, 9696 ) 9697 9698 # Annotation fields 9699 if temporary_view_name: 9700 query_annotation_fields = f""" 9701 SELECT * 9702 FROM ( 9703 DESCRIBE SELECT * 9704 FROM {temporary_view_name} 9705 ) 9706 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 9707 """ 9708 df_annotation_fields = self.get_query_to_df( 9709 query=query_annotation_fields 9710 ) 9711 9712 # Add temporary view and annotation fields 9713 temporary_tables.append(temporary_view_name) 9714 annotation_fields += list(set(df_annotation_fields["column_name"])) 9715 9716 return temporary_tables, annotation_fields 9717 9718 def create_transcript_view( 9719 self, 9720 transcripts_table: str = None, 9721 transcripts_table_drop: bool = True, 9722 param: dict = {}, 9723 ) -> str: 9724 """ 9725 The `create_transcript_view` function generates a transcript view by processing data from a 9726 specified table based on provided parameters and structural information. 
9727 9728 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 9729 is used to specify the name of the table that will store the final transcript view data. If a table 9730 name is not provided, the function will create a new table to store the transcript view data, and by 9731 default,, defaults to transcripts 9732 :type transcripts_table: str (optional) 9733 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 9734 `create_transcript_view` function is a boolean parameter that determines whether to drop the 9735 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 9736 the function will drop the existing transcripts table if it exists, defaults to True 9737 :type transcripts_table_drop: bool (optional) 9738 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 9739 contains information needed to create a transcript view. It includes details such as the structure 9740 of the transcripts, columns mapping, column formats, and other necessary information for generating 9741 the view. This parameter allows for flexibility and customization 9742 :type param: dict 9743 :return: The `create_transcript_view` function returns the name of the transcripts table that was 9744 created or modified during the execution of the function. 
9745 """ 9746 9747 log.debug("Start transcripts view creation...") 9748 9749 # Default 9750 transcripts_table_default = "transcripts" 9751 9752 # Param 9753 if not param: 9754 param = self.get_param() 9755 9756 # Struct 9757 struct = param.get("transcripts", {}).get("struct", None) 9758 9759 if struct: 9760 9761 # Transcripts table 9762 if transcripts_table is None: 9763 transcripts_table = param.get("transcripts", {}).get( 9764 "table", transcripts_table_default 9765 ) 9766 9767 # added_columns 9768 added_columns = [] 9769 9770 # Temporary tables 9771 temporary_tables = [] 9772 9773 # Annotation fields 9774 annotation_fields = [] 9775 9776 # from columns map 9777 columns_maps = struct.get("from_columns_map", []) 9778 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 9779 self.create_transcript_view_from_columns_map( 9780 transcripts_table=transcripts_table, 9781 columns_maps=columns_maps, 9782 added_columns=added_columns, 9783 temporary_tables=temporary_tables, 9784 annotation_fields=annotation_fields, 9785 ) 9786 ) 9787 added_columns += added_columns_tmp 9788 temporary_tables += temporary_tables_tmp 9789 annotation_fields += annotation_fields_tmp 9790 9791 # from column format 9792 column_formats = struct.get("from_column_format", []) 9793 temporary_tables_tmp, annotation_fields_tmp = ( 9794 self.create_transcript_view_from_column_format( 9795 transcripts_table=transcripts_table, 9796 column_formats=column_formats, 9797 temporary_tables=temporary_tables, 9798 annotation_fields=annotation_fields, 9799 ) 9800 ) 9801 temporary_tables += temporary_tables_tmp 9802 annotation_fields += annotation_fields_tmp 9803 9804 # Merge temporary tables query 9805 query_merge = "" 9806 for temporary_table in temporary_tables: 9807 9808 # First temporary table 9809 if not query_merge: 9810 query_merge = f""" 9811 SELECT * FROM {temporary_table} 9812 """ 9813 # other temporary table (using UNION) 9814 else: 9815 query_merge += f""" 9816 UNION BY NAME SELECT * FROM 
{temporary_table} 9817 """ 9818 9819 # Merge on transcript 9820 query_merge_on_transcripts_annotation_fields = [] 9821 # Aggregate all annotations fields 9822 for annotation_field in set(annotation_fields): 9823 query_merge_on_transcripts_annotation_fields.append( 9824 f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """ 9825 ) 9826 # Query for transcripts view 9827 query_merge_on_transcripts = f""" 9828 SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)} 9829 FROM ({query_merge}) 9830 GROUP BY "#CHROM", POS, REF, ALT, transcript 9831 """ 9832 9833 # Drop transcript view is necessary 9834 if transcripts_table_drop: 9835 query_drop = f""" 9836 DROP TABLE IF EXISTS {transcripts_table}; 9837 """ 9838 self.execute_query(query=query_drop) 9839 9840 # Merge and create transcript view 9841 query_create_view = f""" 9842 CREATE TABLE IF NOT EXISTS {transcripts_table} 9843 AS {query_merge_on_transcripts} 9844 """ 9845 self.execute_query(query=query_create_view) 9846 9847 # Remove added columns 9848 for added_column in added_columns: 9849 self.drop_column(column=added_column) 9850 9851 else: 9852 9853 transcripts_table = None 9854 9855 return transcripts_table 9856 9857 def annotation_format_to_table( 9858 self, 9859 uniquify: bool = True, 9860 annotation_field: str = "ANN", 9861 annotation_id: str = "Feature_ID", 9862 view_name: str = "transcripts", 9863 ) -> str: 9864 """ 9865 The function `annotation_format_to_table` converts annotation data from a VCF file into a structured 9866 table format. 9867 9868 :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique 9869 values in the output or not. 
        If set to `True`, the function will make sure that the output values
        are unique, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file that
        contains the annotation information for each variant. This field is used to extract the annotation
        details for further processing in the function, defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method is
        used to specify the identifier for the annotation feature. This identifier will be used as a column
        name in the resulting table or view that is created based on the annotation data. It helps in
        uniquely identifying each annotation entry, defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used to
        specify the name of the temporary table that will be created to store the transformed annotation
        data. This table will hold the extracted information from the annotation field in a structured
        format for further processing or analysis, defaults to transcripts
        :type view_name: str (optional)
        :return: The function `annotation_format_to_table` is returning the name of the view created, which
        is stored in the variable `view_name`.
        """

        # Internal column name holding the exploded/JSON form of the annotation
        annotation_format = "annotation_explode"

        # Transcript annotation: keep only alphanumeric characters
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix
        # NOTE(review): any truthy prefix is overwritten with "INFO/" — confirm
        # this is intended and not a lost `prefix = prefix or "INFO/"`-style fix
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Annotation fields (column names after explode)
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added to the variants table, dropped again at the end
        added_columns = []

        # Explode annotation field in column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract annotation sub-field names from the header description,
            # e.g. "Functional annotations: 'A | B | C'"
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized (alphanumeric-only) name -> original name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Create annotation columns: one JSON document per variant
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find keys
            # NOTE(review): queries reference the bare column {annotation_format}
            # while the dataframe column is prefix + annotation_format — confirm
            # this only works when prefix is '' (see prefix note above)
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Check keys
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # key_clean: alphanumeric-only, safe as SQL identifier
                key_clean = "".join(char for char in key if char.isalnum())

                # Type: sample the key's values to infer the column type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type
                column_type = detect_column_type(df_json_type[key_clean])

                # Append: extract the key for every transcript, cast to the
                # detected type, empty strings becoming NULL
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create view
            query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));"""
            self.execute_query(query=query_view)

        else:

            # Return None
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

return view_name 10009 10010 def transcript_view_to_variants( 10011 self, 10012 transcripts_table: str = None, 10013 transcripts_column_id: str = None, 10014 transcripts_info_json: str = None, 10015 transcripts_info_field_json: str = None, 10016 transcripts_info_format: str = None, 10017 transcripts_info_field_format: str = None, 10018 param: dict = {}, 10019 ) -> bool: 10020 """ 10021 The `transcript_view_to_variants` function updates a variants table with information from 10022 transcripts in JSON format. 10023 10024 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 10025 table containing the transcripts data. If this parameter is not provided, the function will 10026 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 10027 :type transcripts_table: str 10028 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 10029 column in the `transcripts_table` that contains the unique identifier for each transcript. This 10030 identifier is used to match transcripts with variants in the database 10031 :type transcripts_column_id: str 10032 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 10033 of the column in the variants table where the transcripts information will be stored in JSON 10034 format. This parameter allows you to define the column in the variants table that will hold the 10035 JSON-formatted information about transcripts 10036 :type transcripts_info_json: str 10037 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 10038 specify the field in the VCF header that will contain information about transcripts in JSON 10039 format. 
This field will be added to the VCF header as an INFO field with the specified name 10040 :type transcripts_info_field_json: str 10041 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 10042 format of the information about transcripts that will be stored in the variants table. This 10043 format can be used to define how the transcript information will be structured or displayed 10044 within the variants table 10045 :type transcripts_info_format: str 10046 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 10047 specify the field in the VCF header that will contain information about transcripts in a 10048 specific format. This field will be added to the VCF header as an INFO field with the specified 10049 name 10050 :type transcripts_info_field_format: str 10051 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 10052 that contains various configuration settings related to transcripts. It is used to provide 10053 default values for certain parameters if they are not explicitly provided when calling the 10054 method. The `param` dictionary can be passed as an argument 10055 :type param: dict 10056 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 10057 if the operation is successful and `False` if certain conditions are not met. 
10058 """ 10059 10060 msg_info_prefix = "Start transcripts view to variants annotations" 10061 10062 log.debug(f"{msg_info_prefix}...") 10063 10064 # Default 10065 transcripts_table_default = "transcripts" 10066 transcripts_column_id_default = "transcript" 10067 transcripts_info_json_default = None 10068 transcripts_info_format_default = None 10069 transcripts_info_field_json_default = None 10070 transcripts_info_field_format_default = None 10071 10072 # Param 10073 if not param: 10074 param = self.get_param() 10075 10076 # Transcripts table 10077 if transcripts_table is None: 10078 transcripts_table = param.get("transcripts", {}).get( 10079 "table", transcripts_table_default 10080 ) 10081 10082 # Transcripts column ID 10083 if transcripts_column_id is None: 10084 transcripts_column_id = param.get("transcripts", {}).get( 10085 "column_id", transcripts_column_id_default 10086 ) 10087 10088 # Transcripts info json 10089 if transcripts_info_json is None: 10090 transcripts_info_json = param.get("transcripts", {}).get( 10091 "transcripts_info_json", transcripts_info_json_default 10092 ) 10093 10094 # Transcripts info field JSON 10095 if transcripts_info_field_json is None: 10096 transcripts_info_field_json = param.get("transcripts", {}).get( 10097 "transcripts_info_field_json", transcripts_info_field_json_default 10098 ) 10099 # if transcripts_info_field_json is not None and transcripts_info_json is None: 10100 # transcripts_info_json = transcripts_info_field_json 10101 10102 # Transcripts info format 10103 if transcripts_info_format is None: 10104 transcripts_info_format = param.get("transcripts", {}).get( 10105 "transcripts_info_format", transcripts_info_format_default 10106 ) 10107 10108 # Transcripts info field FORMAT 10109 if transcripts_info_field_format is None: 10110 transcripts_info_field_format = param.get("transcripts", {}).get( 10111 "transcripts_info_field_format", transcripts_info_field_format_default 10112 ) 10113 # if ( 10114 # 
transcripts_info_field_format is not None 10115 # and transcripts_info_format is None 10116 # ): 10117 # transcripts_info_format = transcripts_info_field_format 10118 10119 # Variants table 10120 table_variants = self.get_table_variants() 10121 10122 # Check info columns param 10123 if ( 10124 transcripts_info_json is None 10125 and transcripts_info_field_json is None 10126 and transcripts_info_format is None 10127 and transcripts_info_field_format is None 10128 ): 10129 return False 10130 10131 # Transcripts infos columns 10132 query_transcripts_infos_columns = f""" 10133 SELECT * 10134 FROM ( 10135 DESCRIBE SELECT * FROM {transcripts_table} 10136 ) 10137 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 10138 """ 10139 transcripts_infos_columns = list( 10140 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 10141 ) 10142 10143 # View results 10144 clause_select = [] 10145 clause_to_json = [] 10146 clause_to_format = [] 10147 for field in transcripts_infos_columns: 10148 clause_select.append( 10149 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10150 ) 10151 clause_to_json.append(f""" '{field}': "{field}" """) 10152 clause_to_format.append(f""" "{field}" """) 10153 10154 # Update 10155 update_set_json = [] 10156 update_set_format = [] 10157 10158 # VCF header 10159 vcf_reader = self.get_header() 10160 10161 # Transcripts to info column in JSON 10162 if transcripts_info_json is not None: 10163 10164 # Create column on variants table 10165 self.add_column( 10166 table_name=table_variants, 10167 column_name=transcripts_info_json, 10168 column_type="JSON", 10169 default_value=None, 10170 drop=False, 10171 ) 10172 10173 # Add header 10174 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 10175 transcripts_info_json, 10176 ".", 10177 "String", 10178 "Transcripts in JSON format", 10179 "unknwon", 10180 "unknwon", 10181 self.code_type_map["String"], 10182 ) 10183 10184 # Add to update 
10185 update_set_json.append( 10186 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 10187 ) 10188 10189 # Transcripts to info field in JSON 10190 if transcripts_info_field_json is not None: 10191 10192 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 10193 10194 # Add to update 10195 update_set_json.append( 10196 f""" 10197 INFO = concat( 10198 CASE 10199 WHEN INFO NOT IN ('', '.') 10200 THEN INFO 10201 ELSE '' 10202 END, 10203 CASE 10204 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 10205 THEN concat( 10206 ';{transcripts_info_field_json}=', 10207 t.{transcripts_info_json} 10208 ) 10209 ELSE '' 10210 END 10211 ) 10212 """ 10213 ) 10214 10215 # Add header 10216 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 10217 transcripts_info_field_json, 10218 ".", 10219 "String", 10220 "Transcripts in JSON format", 10221 "unknwon", 10222 "unknwon", 10223 self.code_type_map["String"], 10224 ) 10225 10226 if update_set_json: 10227 10228 # Update query 10229 query_update = f""" 10230 UPDATE {table_variants} 10231 SET {", ".join(update_set_json)} 10232 FROM 10233 ( 10234 SELECT 10235 "#CHROM", POS, REF, ALT, 10236 concat( 10237 '{{', 10238 string_agg( 10239 '"' || "{transcripts_column_id}" || '":' || 10240 to_json(json_output) 10241 ), 10242 '}}' 10243 )::JSON AS {transcripts_info_json} 10244 FROM 10245 ( 10246 SELECT 10247 "#CHROM", POS, REF, ALT, 10248 "{transcripts_column_id}", 10249 to_json( 10250 {{{",".join(clause_to_json)}}} 10251 )::JSON AS json_output 10252 FROM 10253 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 10254 WHERE "{transcripts_column_id}" IS NOT NULL 10255 ) 10256 GROUP BY "#CHROM", POS, REF, ALT 10257 ) AS t 10258 WHERE {table_variants}."#CHROM" = t."#CHROM" 10259 AND {table_variants}."POS" = t."POS" 10260 AND {table_variants}."REF" = t."REF" 10261 AND {table_variants}."ALT" = t."ALT" 10262 """ 10263 10264 
self.execute_query(query=query_update) 10265 10266 # Transcripts to info column in FORMAT 10267 if transcripts_info_format is not None: 10268 10269 # Create column on variants table 10270 self.add_column( 10271 table_name=table_variants, 10272 column_name=transcripts_info_format, 10273 column_type="VARCHAR", 10274 default_value=None, 10275 drop=False, 10276 ) 10277 10278 # Add header 10279 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 10280 transcripts_info_format, 10281 ".", 10282 "String", 10283 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 10284 "unknwon", 10285 "unknwon", 10286 self.code_type_map["String"], 10287 ) 10288 10289 # Add to update 10290 update_set_format.append( 10291 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 10292 ) 10293 10294 # Transcripts to info field in JSON 10295 if transcripts_info_field_format is not None: 10296 10297 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 10298 10299 # Add to update 10300 update_set_format.append( 10301 f""" 10302 INFO = concat( 10303 CASE 10304 WHEN INFO NOT IN ('', '.') 10305 THEN INFO 10306 ELSE '' 10307 END, 10308 CASE 10309 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 10310 THEN concat( 10311 ';{transcripts_info_field_format}=', 10312 t.{transcripts_info_format} 10313 ) 10314 ELSE '' 10315 END 10316 ) 10317 """ 10318 ) 10319 10320 # Add header 10321 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 10322 transcripts_info_field_format, 10323 ".", 10324 "String", 10325 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 10326 "unknwon", 10327 "unknwon", 10328 self.code_type_map["String"], 10329 ) 10330 10331 if update_set_format: 10332 10333 # Update query 10334 query_update = f""" 10335 UPDATE {table_variants} 10336 SET {", ".join(update_set_format)} 10337 FROM 10338 ( 10339 SELECT 10340 "#CHROM", POS, REF, ALT, 10341 
string_agg({transcripts_info_format}) AS {transcripts_info_format} 10342 FROM 10343 ( 10344 SELECT 10345 "#CHROM", POS, REF, ALT, 10346 "{transcripts_column_id}", 10347 concat( 10348 "{transcripts_column_id}", 10349 '|', 10350 {", '|', ".join(clause_to_format)} 10351 ) AS {transcripts_info_format} 10352 FROM 10353 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 10354 ) 10355 GROUP BY "#CHROM", POS, REF, ALT 10356 ) AS t 10357 WHERE {table_variants}."#CHROM" = t."#CHROM" 10358 AND {table_variants}."POS" = t."POS" 10359 AND {table_variants}."REF" = t."REF" 10360 AND {table_variants}."ALT" = t."ALT" 10361 """ 10362 10363 self.execute_query(query=query_update) 10364 10365 return True
34class Variants: 35 36 def __init__( 37 self, 38 conn=None, 39 input: str = None, 40 output: str = None, 41 config: dict = {}, 42 param: dict = {}, 43 load: bool = False, 44 ) -> None: 45 """ 46 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 47 header 48 49 :param conn: the connection to the database 50 :param input: the input file 51 :param output: the output file 52 :param config: a dictionary containing the configuration of the model 53 :param param: a dictionary containing the parameters of the model 54 """ 55 56 # Init variables 57 self.init_variables() 58 59 # Input 60 self.set_input(input) 61 62 # Config 63 self.set_config(config) 64 65 # Param 66 self.set_param(param) 67 68 # Output 69 self.set_output(output) 70 71 # connexion 72 self.set_connexion(conn) 73 74 # Header 75 self.set_header() 76 77 # Load data 78 if load: 79 self.load_data() 80 81 def set_input(self, input: str = None) -> None: 82 """ 83 The function `set_input` takes a file name as input, extracts the name and extension, and sets 84 attributes in the class accordingly. 85 86 :param input: The `set_input` method in the provided code snippet is used to set attributes 87 related to the input file. Here's a breakdown of the parameters and their usage in the method: 88 :type input: str 89 """ 90 91 if input and not isinstance(input, str): 92 try: 93 self.input = input.name 94 except: 95 log.error(f"Input file '{input} in bad format") 96 raise ValueError(f"Input file '{input} in bad format") 97 else: 98 self.input = input 99 100 # Input format 101 if input: 102 input_name, input_extension = os.path.splitext(self.input) 103 self.input_name = input_name 104 self.input_extension = input_extension 105 self.input_format = self.input_extension.replace(".", "") 106 107 def set_config(self, config: dict) -> None: 108 """ 109 The set_config function takes a config object and assigns it as the configuration object for the 110 class. 

        :param config: The `config` parameter in the `set_config` function is a dictionary object that
        contains configuration settings for the class. When you call the `set_config` function with a
        dictionary object as the argument, it will set that dictionary as the configuration object for
        the class
        :type config: dict
        """

        self.config = config

    def set_param(self, param: dict) -> None:
        """
        This function sets a parameter object for the class based on the input dictionary.

        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
        as the `param` attribute of the class instance
        :type param: dict
        """

        self.param = param

    def init_variables(self) -> None:
        """
        This function initializes the variables that will be used in the rest of the class
        """

        # Prefix used for temporary artefacts (e.g. temp directories)
        self.prefix = "howard"
        # Name of the main variants table in the database
        self.table_variants = "variants"
        # DataFrame cache (None until populated)
        self.dataframe = None

        # Maps textual comparison operators (used in params/filters) to SQL operators
        self.comparison_map = {
            "gt": ">",
            "gte": ">=",
            "lt": "<",
            "lte": "<=",
            "equals": "=",
            "contains": "SIMILAR TO",
        }

        # Integer codes for VCF header field types (consumed when building vcf.parser._Info entries)
        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}

        # Maps VCF header field types to SQL column types
        self.code_type_map_to_sql = {
            "Integer": "INTEGER",
            "String": "VARCHAR",
            "Float": "FLOAT",
            "Flag": "VARCHAR",
        }

        # Extra fields to index in addition to the defaults
        self.index_additionnal_fields = []

    def get_indexing(self) -> bool:
        """
        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
        returns False.
        :return: The value of the indexing parameter.
        """

        return self.get_param().get("indexing", False)

    def get_connexion_config(self) -> dict:
        """
        The function `get_connexion_config` returns a dictionary containing the configuration for a
        connection, including the number of threads and memory limit.
        :return: a dictionary containing the configuration for the Connexion library.
175 """ 176 177 # config 178 config = self.get_config() 179 180 # Connexion config 181 connexion_config = {} 182 threads = self.get_threads() 183 184 # Threads 185 if threads: 186 connexion_config["threads"] = threads 187 188 # Memory 189 # if config.get("memory", None): 190 # connexion_config["memory_limit"] = config.get("memory") 191 if self.get_memory(): 192 connexion_config["memory_limit"] = self.get_memory() 193 194 # Temporary directory 195 if config.get("tmp", None): 196 connexion_config["temp_directory"] = config.get("tmp") 197 198 # Access 199 if config.get("access", None): 200 access = config.get("access") 201 if access in ["RO"]: 202 access = "READ_ONLY" 203 elif access in ["RW"]: 204 access = "READ_WRITE" 205 connexion_db = self.get_connexion_db() 206 if connexion_db in ":memory:": 207 access = "READ_WRITE" 208 connexion_config["access_mode"] = access 209 210 return connexion_config 211 212 def get_duckdb_settings(self) -> dict: 213 """ 214 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 215 string. 216 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 217 """ 218 219 # config 220 config = self.get_config() 221 222 # duckdb settings 223 duckdb_settings_dict = {} 224 if config.get("duckdb_settings", None): 225 duckdb_settings = config.get("duckdb_settings") 226 duckdb_settings = full_path(duckdb_settings) 227 # duckdb setting is a file 228 if os.path.exists(duckdb_settings): 229 with open(duckdb_settings) as json_file: 230 duckdb_settings_dict = yaml.safe_load(json_file) 231 # duckdb settings is a string 232 else: 233 duckdb_settings_dict = json.loads(duckdb_settings) 234 235 return duckdb_settings_dict 236 237 def set_connexion_db(self) -> str: 238 """ 239 The function `set_connexion_db` returns the appropriate database connection string based on the 240 input format and connection type. 241 :return: the value of the variable `connexion_db`. 
242 """ 243 244 # Default connexion db 245 default_connexion_db = ":memory:" 246 247 # Find connexion db 248 if self.get_input_format() in ["db", "duckdb"]: 249 connexion_db = self.get_input() 250 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 251 connexion_db = default_connexion_db 252 elif self.get_connexion_type() in ["tmpfile"]: 253 tmp_name = tempfile.mkdtemp( 254 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 255 ) 256 connexion_db = f"{tmp_name}/tmp.db" 257 elif self.get_connexion_type() != "": 258 connexion_db = self.get_connexion_type() 259 else: 260 connexion_db = default_connexion_db 261 262 # Set connexion db 263 self.connexion_db = connexion_db 264 265 return connexion_db 266 267 def set_connexion(self, conn) -> None: 268 """ 269 The function `set_connexion` creates a connection to a database, with options for different 270 database formats and settings. 271 272 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 273 database. If a connection is not provided, a new connection to an in-memory database is created. 
        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
        sqlite
        """

        # Connexion db (also stored on the instance by set_connexion_db)
        connexion_db = self.set_connexion_db()

        # Connexion config
        connexion_config = self.get_connexion_config()

        # Connexion format
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion
        # NOTE(review): if connexion_format is neither "duckdb" nor "sqlite",
        # `conn` stays None and self.conn is set to None — confirm intended.
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # duckDB settings: applied one by one as PRAGMA statements.
                # String values are single-quoted for the PRAGMA syntax.
                # NOTE(review): PRAGMA built by f-string interpolation — assumes
                # settings come from trusted configuration only.
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

    def set_output(self, output: str = None) -> None:
        """
        The `set_output` function in Python sets the output file based on the input or a specified key
        in the config file, extracting the output name, extension, and format.

        :param output: The `output` parameter in the `set_output` method is used to specify the name of
        the output file. If the config file has an 'output' key, the method sets the output to the value
        of that key.
If no output is provided, it sets the output to `None` 321 :type output: str 322 """ 323 324 if output and not isinstance(output, str): 325 self.output = output.name 326 else: 327 self.output = output 328 329 # Output format 330 if self.output: 331 output_name, output_extension = os.path.splitext(self.output) 332 self.output_name = output_name 333 self.output_extension = output_extension 334 self.output_format = self.output_extension.replace(".", "") 335 else: 336 self.output_name = None 337 self.output_extension = None 338 self.output_format = None 339 340 def set_header(self) -> None: 341 """ 342 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 343 """ 344 345 input_file = self.get_input() 346 default_header_list = [ 347 "##fileformat=VCFv4.2", 348 "#CHROM POS ID REF ALT QUAL FILTER INFO", 349 ] 350 351 # Full path 352 input_file = full_path(input_file) 353 354 if input_file: 355 356 input_format = self.get_input_format() 357 input_compressed = self.get_input_compressed() 358 config = self.get_config() 359 header_list = default_header_list 360 if input_format in [ 361 "vcf", 362 "hdr", 363 "tsv", 364 "csv", 365 "psv", 366 "parquet", 367 "db", 368 "duckdb", 369 ]: 370 # header provided in param 371 if config.get("header_file", None): 372 with open(config.get("header_file"), "rt") as f: 373 header_list = self.read_vcf_header(f) 374 # within a vcf file format (header within input file itsself) 375 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 376 # within a compressed vcf file format (.vcf.gz) 377 if input_compressed: 378 with bgzf.open(input_file, "rt") as f: 379 header_list = self.read_vcf_header(f) 380 # within an uncompressed vcf file format (.vcf) 381 else: 382 with open(input_file, "rt") as f: 383 header_list = self.read_vcf_header(f) 384 # header provided in default external file .hdr 385 elif os.path.exists((input_file + ".hdr")): 386 with open(input_file + ".hdr", "rt") as f: 387 header_list = 
self.read_vcf_header(f) 388 else: 389 try: # Try to get header info fields and file columns 390 391 with tempfile.TemporaryDirectory() as tmpdir: 392 393 # Create database 394 db_for_header = Database(database=input_file) 395 396 # Get header columns for infos fields 397 db_header_from_columns = ( 398 db_for_header.get_header_from_columns() 399 ) 400 401 # Get real columns in the file 402 db_header_columns = db_for_header.get_columns() 403 404 # Write header file 405 header_file_tmp = os.path.join(tmpdir, "header") 406 f = open(header_file_tmp, "w") 407 vcf.Writer(f, db_header_from_columns) 408 f.close() 409 410 # Replace #CHROM line with rel columns 411 header_list = db_for_header.read_header_file( 412 header_file=header_file_tmp 413 ) 414 header_list[-1] = "\t".join(db_header_columns) 415 416 except: 417 418 log.warning( 419 f"No header for file {input_file}. Set as default VCF header" 420 ) 421 header_list = default_header_list 422 423 else: # try for unknown format ? 424 425 log.error(f"Input file format '{input_format}' not available") 426 raise ValueError(f"Input file format '{input_format}' not available") 427 428 if not header_list: 429 header_list = default_header_list 430 431 # header as list 432 self.header_list = header_list 433 434 # header as VCF object 435 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 436 437 else: 438 439 self.header_list = None 440 self.header_vcf = None 441 442 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 443 """ 444 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 445 DataFrame based on the connection format. 446 447 :param query: The `query` parameter in the `get_query_to_df` function is a string that 448 represents the SQL query you want to execute. 
This query will be used to fetch data from a 449 database and convert it into a pandas DataFrame 450 :type query: str 451 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 452 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 453 function will only fetch up to that number of rows from the database query result. If no limit 454 is specified, 455 :type limit: int 456 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 457 """ 458 459 # Connexion format 460 connexion_format = self.get_connexion_format() 461 462 # Limit in query 463 if limit: 464 pd.set_option("display.max_rows", limit) 465 if connexion_format in ["duckdb"]: 466 df = ( 467 self.conn.execute(query) 468 .fetch_record_batch(limit) 469 .read_next_batch() 470 .to_pandas() 471 ) 472 elif connexion_format in ["sqlite"]: 473 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 474 475 # Full query 476 else: 477 if connexion_format in ["duckdb"]: 478 df = self.conn.execute(query).df() 479 elif connexion_format in ["sqlite"]: 480 df = pd.read_sql_query(query, self.conn) 481 482 return df 483 484 def get_overview(self) -> None: 485 """ 486 The function prints the input, output, config, and dataframe of the current object 487 """ 488 table_variants_from = self.get_table_variants(clause="from") 489 sql_columns = self.get_header_columns_as_sql() 490 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 491 df = self.get_query_to_df(sql_query_export) 492 log.info( 493 "Input: " 494 + str(self.get_input()) 495 + " [" 496 + str(str(self.get_input_format())) 497 + "]" 498 ) 499 log.info( 500 "Output: " 501 + str(self.get_output()) 502 + " [" 503 + str(str(self.get_output_format())) 504 + "]" 505 ) 506 log.info("Config: ") 507 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 508 "\n" 509 ): 510 log.info("\t" + str(d)) 511 log.info("Param: ") 512 for d 
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 513 "\n" 514 ): 515 log.info("\t" + str(d)) 516 log.info("Sample list: " + str(self.get_header_sample_list())) 517 log.info("Dataframe: ") 518 for d in str(df).split("\n"): 519 log.info("\t" + str(d)) 520 521 # garbage collector 522 del df 523 gc.collect() 524 525 return None 526 527 def get_stats(self) -> dict: 528 """ 529 The `get_stats` function calculates and returns various statistics of the current object, 530 including information about the input file, variants, samples, header fields, quality, and 531 SNVs/InDels. 532 :return: a dictionary containing various statistics of the current object. The dictionary has 533 the following structure: 534 """ 535 536 # Log 537 log.info(f"Stats Calculation...") 538 539 # table varaints 540 table_variants_from = self.get_table_variants() 541 542 # stats dict 543 stats = {"Infos": {}} 544 545 ### File 546 input_file = self.get_input() 547 stats["Infos"]["Input file"] = input_file 548 549 # Header 550 header_infos = self.get_header().infos 551 header_formats = self.get_header().formats 552 header_infos_list = list(header_infos) 553 header_formats_list = list(header_formats) 554 555 ### Variants 556 557 stats["Variants"] = {} 558 559 # Variants by chr 560 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 561 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 562 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 563 by=["CHROM"], kind="quicksort" 564 ) 565 566 # Total number of variants 567 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 568 569 # Calculate percentage 570 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 571 lambda x: (x / nb_of_variants) 572 ) 573 574 stats["Variants"]["Number of variants by chromosome"] = ( 575 nb_of_variants_by_chrom.to_dict(orient="index") 576 ) 577 578 
stats["Infos"]["Number of variants"] = int(nb_of_variants) 579 580 ### Samples 581 582 # Init 583 samples = {} 584 nb_of_samples = 0 585 586 # Check Samples 587 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 588 log.debug(f"Check samples...") 589 for sample in self.get_header_sample_list(): 590 sql_query_samples = f""" 591 SELECT '{sample}' as sample, 592 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 593 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 594 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 595 FROM {table_variants_from} 596 WHERE ( 597 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 598 AND 599 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 600 ) 601 GROUP BY genotype 602 """ 603 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 604 sample_genotype_count = sql_query_genotype_df["count"].sum() 605 if len(sql_query_genotype_df): 606 nb_of_samples += 1 607 samples[f"{sample} - {sample_genotype_count} variants"] = ( 608 sql_query_genotype_df.to_dict(orient="index") 609 ) 610 611 stats["Samples"] = samples 612 stats["Infos"]["Number of samples"] = nb_of_samples 613 614 # # 615 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 616 # stats["Infos"]["Number of samples"] = nb_of_samples 617 # elif nb_of_samples: 618 # stats["Infos"]["Number of samples"] = "not a VCF format" 619 620 ### INFO and FORMAT fields 621 header_types_df = {} 622 header_types_list = { 623 "List of INFO fields": header_infos, 624 "List of FORMAT fields": header_formats, 625 } 626 i = 0 627 for header_type in header_types_list: 628 629 header_type_infos = header_types_list.get(header_type) 630 header_infos_dict = {} 631 632 for info in header_type_infos: 633 634 i += 1 635 header_infos_dict[i] = {} 636 637 # ID 638 header_infos_dict[i]["id"] = info 639 640 # num 641 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 642 if header_type_infos[info].num in genotype_map.keys(): 643 header_infos_dict[i]["Number"] = genotype_map.get( 644 header_type_infos[info].num 645 ) 646 else: 647 header_infos_dict[i]["Number"] = header_type_infos[info].num 648 649 # type 650 if header_type_infos[info].type: 651 header_infos_dict[i]["Type"] = header_type_infos[info].type 652 else: 653 header_infos_dict[i]["Type"] = "." 654 655 # desc 656 if header_type_infos[info].desc != None: 657 header_infos_dict[i]["Description"] = header_type_infos[info].desc 658 else: 659 header_infos_dict[i]["Description"] = "" 660 661 if len(header_infos_dict): 662 header_types_df[header_type] = pd.DataFrame.from_dict( 663 header_infos_dict, orient="index" 664 ).to_dict(orient="index") 665 666 # Stats 667 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 668 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 669 stats["Header"] = header_types_df 670 671 ### QUAL 672 if "QUAL" in self.get_header_columns(): 673 sql_query_qual = f""" 674 SELECT 675 avg(CAST(QUAL AS INTEGER)) AS Average, 676 min(CAST(QUAL AS INTEGER)) AS Minimum, 677 max(CAST(QUAL AS INTEGER)) AS Maximum, 678 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 679 median(CAST(QUAL AS INTEGER)) AS Median, 680 variance(CAST(QUAL AS INTEGER)) AS Variance 681 FROM {table_variants_from} 682 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 683 """ 684 685 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 686 stats["Quality"] = {"Stats": qual} 687 688 ### SNV and InDel 689 690 sql_query_snv = f""" 691 692 SELECT Type, count FROM ( 693 694 SELECT 695 'Total' AS Type, 696 count(*) AS count 697 FROM {table_variants_from} 698 699 UNION 700 701 SELECT 702 'MNV' AS Type, 703 count(*) AS count 704 FROM {table_variants_from} 705 WHERE len(REF) > 1 AND len(ALT) > 1 706 AND len(REF) = len(ALT) 707 708 UNION 709 710 SELECT 711 'InDel' AS Type, 712 count(*) AS count 713 FROM 
{table_variants_from} 714 WHERE len(REF) > 1 OR len(ALT) > 1 715 AND len(REF) != len(ALT) 716 717 UNION 718 719 SELECT 720 'SNV' AS Type, 721 count(*) AS count 722 FROM {table_variants_from} 723 WHERE len(REF) = 1 AND len(ALT) = 1 724 725 ) 726 727 ORDER BY count DESC 728 729 """ 730 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 731 732 sql_query_snv_substitution = f""" 733 SELECT 734 concat(REF, '>', ALT) AS 'Substitution', 735 count(*) AS count 736 FROM {table_variants_from} 737 WHERE len(REF) = 1 AND len(ALT) = 1 738 GROUP BY REF, ALT 739 ORDER BY count(*) DESC 740 """ 741 snv_substitution = ( 742 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 743 ) 744 stats["Variants"]["Counts"] = snv_indel 745 stats["Variants"]["Substitutions"] = snv_substitution 746 747 return stats 748 749 def stats_to_file(self, file: str = None) -> str: 750 """ 751 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 752 into a JSON object, and writes the JSON object to the specified file. 753 754 :param file: The `file` parameter is a string that represents the file path where the JSON data 755 will be written 756 :type file: str 757 :return: the name of the file that was written to. 758 """ 759 760 # Get stats 761 stats = self.get_stats() 762 763 # Serializing json 764 json_object = json.dumps(stats, indent=4) 765 766 # Writing to sample.json 767 with open(file, "w") as outfile: 768 outfile.write(json_object) 769 770 return file 771 772 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 773 """ 774 The `print_stats` function generates a markdown file and prints the statistics contained in a 775 JSON file in a formatted manner. 776 777 :param output_file: The `output_file` parameter is a string that specifies the path and filename 778 of the output file where the stats will be printed in Markdown format. 
If no `output_file` is 779 provided, a temporary directory will be created and the stats will be saved in a file named 780 "stats.md" within that 781 :type output_file: str 782 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 783 file where the statistics will be saved. If no value is provided, a temporary directory will be 784 created and a default file name "stats.json" will be used 785 :type json_file: str 786 :return: The function `print_stats` does not return any value. It has a return type annotation 787 of `None`. 788 """ 789 790 # Full path 791 output_file = full_path(output_file) 792 json_file = full_path(json_file) 793 794 with tempfile.TemporaryDirectory() as tmpdir: 795 796 # Files 797 if not output_file: 798 output_file = os.path.join(tmpdir, "stats.md") 799 if not json_file: 800 json_file = os.path.join(tmpdir, "stats.json") 801 802 # Create folders 803 if not os.path.exists(os.path.dirname(output_file)): 804 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 805 if not os.path.exists(os.path.dirname(json_file)): 806 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 807 808 # Create stats JSON file 809 stats_file = self.stats_to_file(file=json_file) 810 811 # Print stats file 812 with open(stats_file) as f: 813 stats = yaml.safe_load(f) 814 815 # Output 816 output_title = [] 817 output_index = [] 818 output = [] 819 820 # Title 821 output_title.append("# HOWARD Stats") 822 823 # Index 824 output_index.append("## Index") 825 826 # Process sections 827 for section in stats: 828 infos = stats.get(section) 829 section_link = "#" + section.lower().replace(" ", "-") 830 output.append(f"## {section}") 831 output_index.append(f"- [{section}]({section_link})") 832 833 if len(infos): 834 for info in infos: 835 try: 836 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 837 is_df = True 838 except: 839 try: 840 df = pd.DataFrame.from_dict( 841 
json.loads((infos.get(info))), orient="index" 842 ) 843 is_df = True 844 except: 845 is_df = False 846 if is_df: 847 output.append(f"### {info}") 848 info_link = "#" + info.lower().replace(" ", "-") 849 output_index.append(f" - [{info}]({info_link})") 850 output.append(f"{df.to_markdown(index=False)}") 851 else: 852 output.append(f"- {info}: {infos.get(info)}") 853 else: 854 output.append(f"NA") 855 856 # Write stats in markdown file 857 with open(output_file, "w") as fp: 858 for item in output_title: 859 fp.write("%s\n" % item) 860 for item in output_index: 861 fp.write("%s\n" % item) 862 for item in output: 863 fp.write("%s\n" % item) 864 865 # Output stats in markdown 866 print("") 867 print("\n\n".join(output_title)) 868 print("") 869 print("\n\n".join(output)) 870 print("") 871 872 return None 873 874 def get_input(self) -> str: 875 """ 876 It returns the value of the input variable. 877 :return: The input is being returned. 878 """ 879 return self.input 880 881 def get_input_format(self, input_file: str = None) -> str: 882 """ 883 This function returns the format of the input variable, either from the provided input file or 884 by prompting for input. 885 886 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 887 represents the file path of the input file. If no `input_file` is provided when calling the 888 method, it will default to `None` 889 :type input_file: str 890 :return: The format of the input variable is being returned. 891 """ 892 893 if not input_file: 894 input_file = self.get_input() 895 input_format = get_file_format(input_file) 896 return input_format 897 898 def get_input_compressed(self, input_file: str = None) -> str: 899 """ 900 The function `get_input_compressed` returns the format of the input variable after compressing 901 it. 902 903 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 904 that represents the file path of the input file. 
If no `input_file` is provided when calling the 905 method, it will default to `None` and the method will then call `self.get_input()` to 906 :type input_file: str 907 :return: The function `get_input_compressed` returns the compressed format of the input 908 variable. 909 """ 910 911 if not input_file: 912 input_file = self.get_input() 913 input_compressed = get_file_compressed(input_file) 914 return input_compressed 915 916 def get_output(self) -> str: 917 """ 918 It returns the output of the neuron. 919 :return: The output of the neural network. 920 """ 921 922 return self.output 923 924 def get_output_format(self, output_file: str = None) -> str: 925 """ 926 The function `get_output_format` returns the format of the input variable or the output file if 927 provided. 928 929 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 930 that represents the file path of the output file. If no `output_file` is provided when calling 931 the method, it will default to the output obtained from the `get_output` method of the class 932 instance. The 933 :type output_file: str 934 :return: The format of the input variable is being returned. 935 """ 936 937 if not output_file: 938 output_file = self.get_output() 939 output_format = get_file_format(output_file) 940 941 return output_format 942 943 def get_config(self) -> dict: 944 """ 945 It returns the config 946 :return: The config variable is being returned. 947 """ 948 return self.config 949 950 def get_param(self) -> dict: 951 """ 952 It returns the param 953 :return: The param variable is being returned. 954 """ 955 return self.param 956 957 def get_connexion_db(self) -> str: 958 """ 959 It returns the connexion_db attribute of the object 960 :return: The connexion_db is being returned. 961 """ 962 return self.connexion_db 963 964 def get_prefix(self) -> str: 965 """ 966 It returns the prefix of the object. 967 :return: The prefix is being returned. 
968 """ 969 return self.prefix 970 971 def get_table_variants(self, clause: str = "select") -> str: 972 """ 973 This function returns the table_variants attribute of the object 974 975 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 976 defaults to select (optional) 977 :return: The table_variants attribute of the object. 978 """ 979 980 # Access 981 access = self.get_config().get("access", None) 982 983 # Clauses "select", "where", "update" 984 if clause in ["select", "where", "update"]: 985 table_variants = self.table_variants 986 # Clause "from" 987 elif clause in ["from"]: 988 # For Read Only 989 if self.get_input_format() in ["parquet"] and access in ["RO"]: 990 input_file = self.get_input() 991 table_variants = f"'{input_file}' as variants" 992 # For Read Write 993 else: 994 table_variants = f"{self.table_variants} as variants" 995 else: 996 table_variants = self.table_variants 997 return table_variants 998 999 def get_tmp_dir(self) -> str: 1000 """ 1001 The function `get_tmp_dir` returns the temporary directory path based on configuration 1002 parameters or a default path. 1003 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1004 configuration, parameters, and a default value of "/tmp". 1005 """ 1006 1007 return get_tmp( 1008 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1009 ) 1010 1011 def get_connexion_type(self) -> str: 1012 """ 1013 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1014 1015 :return: The connexion type is being returned. 1016 """ 1017 return self.get_config().get("connexion_type", "memory") 1018 1019 def get_connexion(self): 1020 """ 1021 It returns the connection object 1022 1023 :return: The connection object. 1024 """ 1025 return self.conn 1026 1027 def close_connexion(self) -> None: 1028 """ 1029 This function closes the connection to the database. 
def get_header(self, type: str = "vcf"):
    """Return the VCF header.

    :param type: "vcf" for a vcf.Reader object, "list" for raw header lines
    :return: the header in the requested representation; falls back to the
        minimal required VCF header when none has been loaded
    """
    if self.header_vcf:
        if type == "vcf":
            return self.header_vcf
        if type == "list":
            return self.header_list
        return None
    # No header loaded: fall back to the minimal required header
    if type == "vcf":
        return vcf.Reader(io.StringIO("\n".join(vcf_required)))
    if type == "list":
        return vcf_required

def get_header_length(self, file: str = None) -> int:
    """Return the header length, excluding the #CHROM line.

    :param file: optional VCF header file to read instead of the loaded header
    :return: number of header lines minus one (the #CHROM line), or 0
    """
    if file:
        return len(self.read_vcf_header_file(file=file)) - 1
    header_list = self.get_header(type="list")
    if header_list:
        return len(header_list) - 1
    return 0

def get_header_columns(self) -> str:
    """Return the #CHROM columns line of the header.

    :return: the last header line (the #CHROM line), or "" without a header
    """
    if not self.get_header():
        return ""
    return self.get_header(type="list")[-1]

def get_header_columns_as_list(self) -> list:
    """Return the #CHROM columns line split into a list.

    :return: column names as a list, or [] without a header
    """
    if not self.get_header():
        return []
    return self.get_header_columns().strip().split("\t")
1089 """ 1090 if self.get_header(): 1091 return self.get_header_columns().strip().split("\t") 1092 else: 1093 return [] 1094 1095 def get_header_columns_as_sql(self) -> str: 1096 """ 1097 This function retruns header length (without #CHROM line) 1098 1099 :return: The length of the header list. 1100 """ 1101 sql_column_list = [] 1102 for col in self.get_header_columns_as_list(): 1103 sql_column_list.append(f'"{col}"') 1104 return ",".join(sql_column_list) 1105 1106 def get_header_sample_list(self) -> list: 1107 """ 1108 This function retruns header length (without #CHROM line) 1109 1110 :return: The length of the header list. 1111 """ 1112 return self.header_vcf.samples 1113 1114 def get_verbose(self) -> bool: 1115 """ 1116 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1117 exist 1118 1119 :return: The value of the key "verbose" in the config dictionary. 1120 """ 1121 return self.get_config().get("verbose", False) 1122 1123 def get_connexion_format(self) -> str: 1124 """ 1125 It returns the connexion format of the object. 1126 :return: The connexion_format is being returned. 1127 """ 1128 connexion_format = self.connexion_format 1129 if connexion_format not in ["duckdb", "sqlite"]: 1130 log.error(f"Unknown connexion format {connexion_format}") 1131 raise ValueError(f"Unknown connexion format {connexion_format}") 1132 else: 1133 return connexion_format 1134 1135 def insert_file_to_table( 1136 self, 1137 file, 1138 columns: str, 1139 header_len: int = 0, 1140 sep: str = "\t", 1141 chunksize: int = 1000000, 1142 ) -> None: 1143 """ 1144 The function reads a file in chunks and inserts each chunk into a table based on the specified 1145 database format. 1146 1147 :param file: The `file` parameter is the file that you want to load into a table. 
It should be 1148 the path to the file on your system 1149 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1150 should contain the names of the columns in the table where the data will be inserted. The column 1151 names should be separated by commas within the string. For example, if you have columns named 1152 "id", "name 1153 :type columns: str 1154 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1155 the number of lines to skip at the beginning of the file before reading the actual data. This 1156 parameter allows you to skip any header information present in the file before processing the 1157 data, defaults to 0 1158 :type header_len: int (optional) 1159 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1160 separator character that is used in the file being read. In this case, the default separator is 1161 set to `\t`, which represents a tab character. You can change this parameter to a different 1162 separator character if, defaults to \t 1163 :type sep: str (optional) 1164 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1165 when processing the file in chunks. In the provided code snippet, the default value for 1166 `chunksize` is set to 1000000. 
def load_data(
    self,
    input_file: str = None,
    drop_variants_table: bool = False,
    sample_size: int = 20480,
) -> None:
    """Load the input file into the variants table.

    :param input_file: optional new input file; replaces the current input
        (the header is re-read)
    :param drop_variants_table: drop the variants table before loading
    :param sample_size: number of rows sampled to infer the schema;
        0/None means no limit (encoded as -1)
    :raises ValueError: when the input format cannot be loaded with the
        current connexion format
    """
    log.info("Loading...")

    # Switch input file if provided (header must be re-read)
    if input_file:
        self.set_input(input_file)
        self.set_header()

    # Optionally start from a clean table
    if drop_variants_table:
        self.drop_variants_table()

    table_variants = self.get_table_variants()

    # Access mode ("RO" = read-only: views instead of tables)
    access = self.get_config().get("access", None)
    log.debug(f"access: {access}")

    input_format = self.get_input_format()
    input_compressed = self.get_input_compressed()
    log.debug(f"input_format: {input_format}")
    log.debug(f"input_compressed: {input_compressed}")

    input_compressed_format = "gzip" if input_compressed else "none"
    log.debug(f"input_compressed_format: {input_compressed_format}")

    connexion_format = self.get_connexion_format()

    # -1 encodes "no sampling limit"
    if not sample_size:
        sample_size = -1
    log.debug(f"sample_size: {sample_size}")

    log.debug(f"Load Data from {input_format}")

    # DuckDB connexion
    if connexion_format in ["duckdb"]:

        if self.input_format in ["db", "duckdb"]:
            # Input is already a duckdb database: nothing to load
            log.debug(f"Input file format '{self.input_format}' duckDB")
        else:
            try:
                # Create table or view from the input file
                database = Database(database=self.input)
                sql_from = database.get_sql_from(sample_size=sample_size)
                if access in ["RO"]:
                    sql_load = (
                        f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                    )
                else:
                    sql_load = (
                        f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                    )
                self.conn.execute(sql_load)
            except Exception as e:
                # Fix: was a bare 'except:'; narrow and chain the cause
                log.error(f"Input file format '{self.input_format}' not available")
                raise ValueError(
                    f"Input file format '{self.input_format}' not available"
                ) from e

    # SQLite connexion
    elif connexion_format in ["sqlite"] and input_format in [
        "vcf",
        "tsv",
        "csv",
        "psv",
    ]:

        # Main VCF structure
        structure = {
            "#CHROM": "VARCHAR",
            "POS": "INTEGER",
            "ID": "VARCHAR",
            "REF": "VARCHAR",
            "ALT": "VARCHAR",
            "QUAL": "VARCHAR",
            "FILTER": "VARCHAR",
            "INFO": "VARCHAR",
        }

        # Fix: copy instead of aliasing, so the base structure is not mutated
        structure_complete = dict(structure)
        if self.get_header_sample_list():
            structure_complete["FORMAT"] = "VARCHAR"
            for sample in self.get_header_sample_list():
                structure_complete[sample] = "VARCHAR"

        # Column clauses for CREATE and INSERT
        sql_create_table_columns = []
        sql_create_table_columns_list = []
        for column, column_type in structure_complete.items():
            sql_create_table_columns.append(
                f'"{column}" {column_type} default NULL'
            )
            sql_create_table_columns_list.append(f'"{column}"')

        # Create database table
        log.debug(f"Create Table {table_variants}")
        sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
        sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
        sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
        self.conn.execute(sql_create_table)

        # chunksize defines length of file chunk to load
        chunksize = 100000

        # delimiter
        delimiter = file_format_delimiters.get(input_format, "\t")

        # VCF files carry a header to skip; other formats do not
        header_len = self.get_header_length() if input_format in ["vcf"] else 0

        # Fix: open exactly one handle (bgzf for compressed input) and
        # always close it (the old code leaked the bgzf handle)
        if input_compressed:
            fh = bgzf.open(self.input, "rt")
        else:
            fh = open(self.input, "rt")
        try:
            # Insert the file contents into the table
            self.insert_file_to_table(
                fh,
                columns=sql_create_table_columns_list_sql,
                header_len=header_len,
                sep=delimiter,
                chunksize=chunksize,
            )
        finally:
            fh.close()

    else:
        log.error(
            f"Connexion format '{connexion_format}' not available with format '{input_format}'"
        )
        raise ValueError(
            f"Connexion format '{connexion_format}' not available with format '{input_format}'"
        )

    # Explode INFO fields into table columns if requested
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=True,
        )

    # Create indexes after insertion
    self.create_indexes()


def get_explode_infos(self) -> bool:
    """Return whether INFO fields should be exploded into columns.

    :return: the param explode.explode_infos boolean (default False)
    """
    return self.get_param().get("explode", {}).get("explode_infos", False)
def get_explode_infos_fields(
    self,
    explode_infos_fields: str = None,
    remove_fields_not_in_header: bool = False,
) -> list:
    """Resolve the list of INFO fields to explode into columns.

    :param explode_infos_fields: comma-separated names, a list, or regex
        patterns; "*" (the default when unset) means every header field
    :param remove_fields_not_in_header: drop names absent from the header
    :return: ordered list of resolved field names (never contains ".*")
    """

    # If no fields given, look them up in param
    if not explode_infos_fields:
        explode_infos_fields = (
            self.get_param().get("explode", {}).get("explode_infos_fields", None)
        )

    # Still nothing: default to the "all fields" keyword
    if not explode_infos_fields:
        explode_infos_fields = "*"

    # Normalize input to a list of names/patterns
    # (explode_infos_fields is always truthy here; the old dead
    # "else: return []" branch was removed)
    if isinstance(explode_infos_fields, str):
        fields_input = explode_infos_fields.split(",")
    elif isinstance(explode_infos_fields, list):
        fields_input = explode_infos_fields
    else:
        fields_input = []

    # Fields declared in the VCF header
    fields_in_header = sorted(list(set(self.get_header().infos)))

    # Construct list of fields
    fields_output = []
    for field in fields_input:

        # Strip field
        field = field.strip()

        # "*" keyword becomes a match-all regex
        if field.upper() in ["*"]:
            field = ".*"

        # Find all header fields matching the pattern
        r = re.compile(field)
        fields_search = sorted(list(filter(r.match, fields_in_header)))

        # An exact name takes precedence over pattern expansion
        if field in fields_search:
            fields_search = [field]
        elif fields_search != [field]:
            # Pattern expansion: exclude names explicitly listed in the input
            fields_search = sorted(
                list(set(fields_search).difference(fields_input))
            )

        # Keep unknown fields unless explicitly filtered out
        # (tolerates not-well-formatted headers)
        if not fields_search and not remove_fields_not_in_header:
            fields_search = [field]

        # Add found fields, without duplicates
        for new_field in fields_search:
            if (
                new_field not in fields_output
                and (
                    not remove_fields_not_in_header
                    or new_field in fields_in_header
                )
                and new_field not in [".*"]
            ):
                fields_output.append(new_field)

    return fields_output


def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
    """Return the prefix used for exploded INFO columns.

    :param explode_infos_prefix: explicit prefix; when empty, falls back to
        param explode.explode_infos_prefix (default "")
    :return: the prefix string
    """

    if not explode_infos_prefix:
        explode_infos_prefix = (
            self.get_param().get("explode", {}).get("explode_infos_prefix", "")
        )

    return explode_infos_prefix


def add_column(
    self,
    table_name,
    column_name,
    column_type,
    default_value=None,
    drop: bool = False,
) -> dict:
    """Add a column to a table if it does not already exist.

    :param table_name: table to alter
    :param column_name: name of the column to add
    :param column_type: SQL type of the new column (e.g. "VARCHAR")
    :param default_value: optional DEFAULT clause value
    :param drop: when True and the column exists, drop and re-create it
    :return: a dict describing the added column, or None when the column
        already existed (with or without drop+re-create)
    """

    added = False
    dropped = False

    # Case-insensitive existence check against the table's current columns
    query = f""" SELECT * FROM {table_name} LIMIT 0 """
    columns = self.get_query_to_df(query).columns.tolist()
    if column_name.upper() in [c.upper() for c in columns]:
        log.debug(
            f"The {column_name} column already exists in the {table_name} table"
        )
        if drop:
            self.drop_column(table_name=table_name, column_name=column_name)
            dropped = True
        else:
            return None
    else:
        log.debug(f"The {column_name} column NOT exists in the {table_name} table")

    # Add column in table
    add_column_query = (
        f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
    )
    if default_value is not None:
        add_column_query += f" DEFAULT {default_value}"
    self.execute_query(add_column_query)
    # NOTE(review): after a drop+re-add, 'added' stays False and None is
    # returned; callers (e.g. explode_infos with force=True) rely on this
    added = not dropped
    log.debug(
        f"The {column_name} column was successfully added to the {table_name} table"
    )

    if added:
        added_column = {
            "table_name": table_name,
            "column_name": column_name,
            "column_type": column_type,
            "default_value": default_value,
        }
    else:
        added_column = None

    return added_column


def drop_column(
    self, column: dict = None, table_name: str = None, column_name: str = None
) -> bool:
    """Drop a column from a table.

    :param column: dict with "table_name"/"column_name" keys, or a plain
        column name (the variants table is then assumed)
    :param table_name: table to alter (when `column` is not given)
    :param column_name: column to drop (when `column` is not given)
    :return: True when the column was dropped, False otherwise
    """

    # Find column infos
    if column:
        if isinstance(column, dict):
            table_name = column.get("table_name", None)
            column_name = column.get("column_name", None)
        elif isinstance(column, str):
            table_name = self.get_table_variants()
            column_name = column
        else:
            table_name = None
            column_name = None

    # Fix: require BOTH table and column (was "and", which let a single
    # missing value slip through to an invalid query)
    if not table_name or not column_name:
        return False

    # Removed
    removed = False

    # Check if the column exists in the table
    query = f""" SELECT * FROM {table_name} LIMIT 0 """
    columns = self.get_query_to_df(query).columns.tolist()
    if column_name in columns:
        log.debug(f"The {column_name} column exists in the {table_name} table")
    else:
        log.debug(f"The {column_name} column NOT exists in the {table_name} table")
        return False

    # Drop column from table
    add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
    self.execute_query(add_column_query)
    removed = True
    log.debug(
        f"The {column_name} column was successfully dropped to the {table_name} table"
    )

    return removed
def explode_infos(
    self,
    prefix: str = None,
    create_index: bool = False,
    fields: list = None,
    force: bool = False,
    proccess_all_fields_together: bool = False,
    table: str = None,
) -> list:
    """Explode VCF INFO fields into individual table columns.

    For each resolved INFO field, adds a column to the variants table and
    fills it by extracting the "FIELD=value" token from the INFO string,
    using a dialect-specific SQL expression (REGEXP_EXTRACT on duckdb,
    instr/substr arithmetic on sqlite). Updates run per chromosome.

    :param prefix: prefix for exploded columns (default "INFO/" or the
        configured explode_infos_prefix)
    :param create_index: create indexes after exploding
    :param fields: INFO fields (or patterns) to explode
    :param force: drop and re-create columns that already exist
    :param proccess_all_fields_together: one UPDATE for all fields instead
        of one UPDATE per field
    :param table: target table (default: the variants table)
    :return: list of added column descriptors (dicts from add_column)
    """

    # drop indexes (they would be invalidated by the updates)
    self.drop_indexes()

    # connexion format ("duckdb" or "sqlite") selects the SQL dialect below
    connexion_format = self.get_connexion_format()

    # Access mode: read-only connections cannot be altered
    access = self.get_config().get("access", None)

    # Added columns
    added_columns = []

    if access not in ["RO"]:

        # prefix: fall back to configured prefix, then to "INFO/"
        if prefix in [None, True] or not isinstance(prefix, str):
            if self.get_explode_infos_prefix() not in [None, True]:
                prefix = self.get_explode_infos_prefix()
            else:
                prefix = "INFO/"

        # table variants
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="select")

        # extra infos
        # NOTE(review): bare except deliberately treats any failure as
        # "no extra infos" — best-effort lookup
        try:
            extra_infos = self.get_extra_infos()
        except:
            extra_infos = []

        # Header infos
        header_infos = self.get_header().infos

        log.debug(
            f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
        )

        sql_info_alter_table_array = []

        # Info fields to check: header fields plus explicitly requested ones
        fields_list = list(header_infos)
        if fields:
            fields_list += fields
        fields_list = set(fields_list)

        # If no fields
        if not fields:
            fields = []

        # Translate fields if patterns (e.g. "*" or regex)
        fields = self.get_explode_infos_fields(explode_infos_fields=fields)

        for info in fields:

            # Column name = prefix + INFO field name
            info_id_sql = prefix + info

            if (
                info in fields_list
                or prefix + info in fields_list
                or info in extra_infos
            ):

                log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                # Column type from the header declaration; unknown fields
                # default to String with num 0
                if info in header_infos:
                    info_type = header_infos[info].type
                    info_num = header_infos[info].num
                else:
                    info_type = "String"
                    info_num = 0

                # Non-scalar fields (num != 1) are stored as VARCHAR
                type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                if info_num != 1:
                    type_sql = "VARCHAR"

                # Add field (drop=force re-creates an existing column)
                added_column = self.add_column(
                    table_name=table_variants,
                    column_name=info_id_sql,
                    column_type=type_sql,
                    default_value="null",
                    drop=force,
                )

                if added_column:
                    added_columns.append(added_column)

                if added_column or force:

                    # add field to index
                    self.index_additionnal_fields.append(info_id_sql)

                    # Update field array: extract "info=value" from INFO.
                    # duckdb: regex on ';'-prefixed INFO; empty or '.' => NULL
                    if connexion_format in ["duckdb"]:
                        update_info_field = f"""
                        "{info_id_sql}" =
                            CASE
                                WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                            END
                        """
                    # sqlite: no regex — locate '{info}=' with instr and
                    # slice the value with substr, up to the next ';'
                    elif connexion_format in ["sqlite"]:
                        update_info_field = f"""
                        "{info_id_sql}" =
                            CASE
                                WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                            END
                        """

                    sql_info_alter_table_array.append(update_info_field)

        if sql_info_alter_table_array:

            # By chromosomes
            # NOTE(review): bare except falls back to a single unpartitioned
            # update when the chromosome list cannot be fetched
            try:
                chromosomes_list = list(
                    self.get_query_to_df(
                        f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                    )["#CHROM"]
                )
            except:
                chromosomes_list = [None]

            for chrom in chromosomes_list:
                log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                # Where clause (only useful with more than one chromosome)
                where_clause = ""
                if chrom and len(chromosomes_list) > 1:
                    where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                # Update table
                if proccess_all_fields_together:
                    sql_info_alter_table_array_join = ", ".join(
                        sql_info_alter_table_array
                    )
                    if sql_info_alter_table_array_join:
                        sql_info_alter_table = f"""
                        UPDATE {table_variants}
                        SET {sql_info_alter_table_array_join}
                        {where_clause}
                        """
                        log.debug(
                            f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                        )
                        # log.debug(sql_info_alter_table)
                        self.conn.execute(sql_info_alter_table)
                else:
                    # One UPDATE per field
                    sql_info_alter_num = 0
                    for sql_info_alter in sql_info_alter_table_array:
                        sql_info_alter_num += 1
                        sql_info_alter_table = f"""
                        UPDATE {table_variants}
                        SET {sql_info_alter}
                        {where_clause}
                        """
                        log.debug(
                            f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                        )
                        # log.debug(sql_info_alter_table)
                        self.conn.execute(sql_info_alter_table)

    # create indexes
    if create_index:
        self.create_indexes()

    return added_columns


def create_indexes(self) -> None:
    """Create indexes on the variants table after insertion.

    Indexes the (#CHROM, POS, REF, ALT) tuple, each of those columns
    individually, and every exploded field registered in
    index_additionnal_fields. Skipped in read-only mode or when indexing
    is disabled.
    """

    # Access
    access = self.get_config().get("access", None)

    # get table variants
    table_variants = self.get_table_variants("FROM")

    if self.get_indexing() and access not in ["RO"]:
        # Composite index on the variant key
        sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
        self.conn.execute(sql_create_table_index)
        sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
        self.conn.execute(sql_create_table_index)
        sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
        self.conn.execute(sql_create_table_index)
        sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
        self.conn.execute(sql_create_table_index)
        sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
        self.conn.execute(sql_create_table_index)
        # One index per exploded INFO field
        for field in self.index_additionnal_fields:
            sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
            self.conn.execute(sql_create_table_index)


def drop_indexes(self) -> None:
    """Drop all indexes on the variants table.

    Lists existing indexes through the dialect's catalog (duckdb_indexes
    or sqlite_master) and drops each one. Skipped in read-only mode.
    """

    # Access
    access = self.get_config().get("access", None)

    # get table variants
    table_variants = self.get_table_variants("FROM")

    # Get database format
    connexion_format = self.get_connexion_format()

    if access not in ["RO"]:
        # Catalog query depends on the SQL dialect
        if connexion_format in ["duckdb"]:
            sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
        elif connexion_format in ["sqlite"]:
            sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"

        list_indexes = self.conn.execute(sql_list_indexes)
        index_names = [row[0] for row in list_indexes.fetchall()]
        for index in index_names:
            sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
            self.conn.execute(sql_drop_table_index)
self.get_connexion_format() 1901 1902 if access not in ["RO"]: 1903 if connexion_format in ["duckdb"]: 1904 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 1905 elif connexion_format in ["sqlite"]: 1906 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 1907 1908 list_indexes = self.conn.execute(sql_list_indexes) 1909 index_names = [row[0] for row in list_indexes.fetchall()] 1910 for index in index_names: 1911 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 1912 self.conn.execute(sql_drop_table_index) 1913 1914 def read_vcf_header(self, f) -> list: 1915 """ 1916 It reads the header of a VCF file and returns a list of the header lines 1917 1918 :param f: the file object 1919 :return: The header lines of the VCF file. 1920 """ 1921 1922 header_list = [] 1923 for line in f: 1924 header_list.append(line) 1925 if line.startswith("#CHROM"): 1926 break 1927 return header_list 1928 1929 def read_vcf_header_file(self, file: str = None) -> list: 1930 """ 1931 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 1932 uncompressed files. 1933 1934 :param file: The `file` parameter is a string that represents the path to the VCF header file 1935 that you want to read. It is an optional parameter, so if you don't provide a value, it will 1936 default to `None` 1937 :type file: str 1938 :return: The function `read_vcf_header_file` returns a list. 1939 """ 1940 1941 if self.get_input_compressed(input_file=file): 1942 with bgzf.open(file, "rt") as f: 1943 return self.read_vcf_header(f=f) 1944 else: 1945 with open(file, "rt") as f: 1946 return self.read_vcf_header(f=f) 1947 1948 def execute_query(self, query: str): 1949 """ 1950 It takes a query as an argument, executes it, and returns the results 1951 1952 :param query: The query to be executed 1953 :return: The result of the query is being returned. 
1954 """ 1955 if query: 1956 return self.conn.execute(query) # .fetchall() 1957 else: 1958 return None 1959 1960 def export_output( 1961 self, 1962 output_file: str | None = None, 1963 output_header: str | None = None, 1964 export_header: bool = True, 1965 query: str | None = None, 1966 parquet_partitions: list | None = None, 1967 chunk_size: int | None = None, 1968 threads: int | None = None, 1969 sort: bool = False, 1970 index: bool = False, 1971 order_by: str | None = None, 1972 ) -> bool: 1973 """ 1974 The `export_output` function exports data from a VCF file to a specified output file in various 1975 formats, including VCF, CSV, TSV, PSV, and Parquet. 1976 1977 :param output_file: The `output_file` parameter is a string that specifies the name of the 1978 output file to be generated by the function. This is where the exported data will be saved 1979 :type output_file: str 1980 :param output_header: The `output_header` parameter is a string that specifies the name of the 1981 file where the header of the VCF file will be exported. If this parameter is not provided, the 1982 header will be exported to a file with the same name as the `output_file` parameter, but with 1983 the extension " 1984 :type output_header: str 1985 :param export_header: The `export_header` parameter is a boolean flag that determines whether 1986 the header of a VCF file should be exported to a separate file or not. If `export_header` is 1987 True, the header will be exported to a file. If `export_header` is False, the header will not 1988 be, defaults to True, if output format is not VCF 1989 :type export_header: bool (optional) 1990 :param query: The `query` parameter is an optional SQL query that can be used to filter and 1991 select specific data from the VCF file before exporting it. 
If provided, only the data that 1992 matches the query will be exported 1993 :type query: str 1994 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 1995 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 1996 organize data in a hierarchical directory structure based on the values of one or more columns. 1997 This can improve query performance when working with large datasets 1998 :type parquet_partitions: list 1999 :param chunk_size: The `chunk_size` parameter specifies the number of 2000 records in batch when exporting data in Parquet format. This parameter is used for 2001 partitioning the Parquet file into multiple files. 2002 :type chunk_size: int 2003 :param threads: The `threads` parameter is an optional parameter that specifies the number of 2004 threads to be used during the export process. It determines the level of parallelism and can 2005 improve the performance of the export operation. If not provided, the function will use the 2006 default number of threads 2007 :type threads: int 2008 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 2009 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 2010 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 2011 False 2012 :type sort: bool (optional) 2013 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2014 created on the output file. If `index` is True, an index will be created. If `index` is False, 2015 no index will be created. The default value is False, defaults to False 2016 :type index: bool (optional) 2017 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2018 sorting the output file. This parameter is only applicable when exporting data in VCF format 2019 :type order_by: str 2020 :return: a boolean value. 
It checks if the output file exists and returns True if it does, or 2021 None if it doesn't. 2022 """ 2023 2024 # Log 2025 log.info("Exporting...") 2026 2027 # Full path 2028 output_file = full_path(output_file) 2029 output_header = full_path(output_header) 2030 2031 # Config 2032 config = self.get_config() 2033 2034 # Param 2035 param = self.get_param() 2036 2037 # Tmp files to remove 2038 tmp_to_remove = [] 2039 2040 # If no output, get it 2041 if not output_file: 2042 output_file = self.get_output() 2043 2044 # If not threads 2045 if not threads: 2046 threads = self.get_threads() 2047 2048 # Auto header name with extension 2049 if export_header or output_header: 2050 if not output_header: 2051 output_header = f"{output_file}.hdr" 2052 # Export header 2053 self.export_header(output_file=output_file) 2054 2055 # Switch off export header if VCF output 2056 output_file_type = get_file_format(output_file) 2057 if output_file_type in ["vcf"]: 2058 export_header = False 2059 tmp_to_remove.append(output_header) 2060 2061 # Chunk size 2062 if not chunk_size: 2063 chunk_size = config.get("chunk_size", None) 2064 2065 # Parquet partition 2066 if not parquet_partitions: 2067 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2068 if parquet_partitions and isinstance(parquet_partitions, str): 2069 parquet_partitions = parquet_partitions.split(",") 2070 2071 # Order by 2072 if not order_by: 2073 order_by = param.get("export", {}).get("order_by", "") 2074 2075 # Header in output 2076 header_in_output = param.get("export", {}).get("include_header", False) 2077 2078 # Database 2079 database_source = self.get_connexion() 2080 2081 # Connexion format 2082 connexion_format = self.get_connexion_format() 2083 2084 # Explode infos 2085 if self.get_explode_infos(): 2086 self.explode_infos( 2087 prefix=self.get_explode_infos_prefix(), 2088 fields=self.get_explode_infos_fields(), 2089 force=False, 2090 ) 2091 2092 # if connexion_format in ["sqlite"] or query: 
2093 if connexion_format in ["sqlite"]: 2094 2095 # Export in Parquet 2096 random_tmp = "".join( 2097 random.choice(string.ascii_lowercase) for i in range(10) 2098 ) 2099 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2100 tmp_to_remove.append(database_source) 2101 2102 # Table Variants 2103 table_variants = self.get_table_variants() 2104 2105 # Create export query 2106 sql_query_export_subquery = f""" 2107 SELECT * FROM {table_variants} 2108 """ 2109 2110 # Write source file 2111 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2112 2113 # Create database 2114 database = Database( 2115 database=database_source, 2116 table="variants", 2117 header_file=output_header, 2118 conn_config=self.get_connexion_config(), 2119 ) 2120 2121 # Existing colomns header 2122 # existing_columns_header = database.get_header_file_columns(output_header) 2123 existing_columns_header = database.get_header_columns_from_database() 2124 2125 # Export file 2126 database.export( 2127 output_database=output_file, 2128 output_header=output_header, 2129 existing_columns_header=existing_columns_header, 2130 parquet_partitions=parquet_partitions, 2131 chunk_size=chunk_size, 2132 threads=threads, 2133 sort=sort, 2134 index=index, 2135 header_in_output=header_in_output, 2136 order_by=order_by, 2137 query=query, 2138 export_header=export_header, 2139 ) 2140 2141 # Remove 2142 remove_if_exists(tmp_to_remove) 2143 2144 return (os.path.exists(output_file) or None) and ( 2145 os.path.exists(output_file) or None 2146 ) 2147 2148 def get_extra_infos(self, table: str = None) -> list: 2149 """ 2150 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2151 in the header. 2152 2153 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2154 name of the table from which you want to retrieve the extra columns that are not present in the 2155 header. 
If the `table` parameter is not provided when calling the function, it will default to 2156 using the variants 2157 :type table: str 2158 :return: A list of columns that are in the specified table but not in the header of the table. 2159 """ 2160 2161 header_columns = [] 2162 2163 if not table: 2164 table = self.get_table_variants(clause="from") 2165 header_columns = self.get_header_columns() 2166 2167 # Check all columns in the database 2168 query = f""" SELECT * FROM {table} LIMIT 1 """ 2169 log.debug(f"query {query}") 2170 table_columns = self.get_query_to_df(query).columns.tolist() 2171 extra_columns = [] 2172 2173 # Construct extra infos (not in header) 2174 for column in table_columns: 2175 if column not in header_columns: 2176 extra_columns.append(column) 2177 2178 return extra_columns 2179 2180 def get_extra_infos_sql(self, table: str = None) -> str: 2181 """ 2182 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2183 by double quotes 2184 2185 :param table: The name of the table to get the extra infos from. If None, the default table is 2186 used 2187 :type table: str 2188 :return: A string of the extra infos 2189 """ 2190 2191 return ", ".join( 2192 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2193 ) 2194 2195 def export_header( 2196 self, 2197 header_name: str = None, 2198 output_file: str = None, 2199 output_file_ext: str = ".hdr", 2200 clean_header: bool = True, 2201 remove_chrom_line: bool = False, 2202 ) -> str: 2203 """ 2204 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2205 specified options, and writes it to a new file. 2206 2207 :param header_name: The `header_name` parameter is the name of the header file to be created. 
If 2208 this parameter is not specified, the header will be written to the output file 2209 :type header_name: str 2210 :param output_file: The `output_file` parameter in the `export_header` function is used to 2211 specify the name of the output file where the header will be written. If this parameter is not 2212 provided, the header will be written to a temporary file 2213 :type output_file: str 2214 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2215 string that represents the extension of the output header file. By default, it is set to ".hdr" 2216 if not specified by the user. This extension will be appended to the `output_file` name to 2217 create the final, defaults to .hdr 2218 :type output_file_ext: str (optional) 2219 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2220 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2221 `True`, the function will clean the header by modifying certain lines based on a specific 2222 pattern. If `clean_header`, defaults to True 2223 :type clean_header: bool (optional) 2224 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2225 boolean flag that determines whether the #CHROM line should be removed from the header before 2226 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2227 defaults to False 2228 :type remove_chrom_line: bool (optional) 2229 :return: The function `export_header` returns the name of the temporary header file that is 2230 created. 
2231 """ 2232 2233 if not header_name and not output_file: 2234 output_file = self.get_output() 2235 2236 if self.get_header(): 2237 2238 # Get header object 2239 header_obj = self.get_header() 2240 2241 # Create database 2242 db_for_header = Database(database=self.get_input()) 2243 2244 # Get real columns in the file 2245 db_header_columns = db_for_header.get_columns() 2246 2247 with tempfile.TemporaryDirectory() as tmpdir: 2248 2249 # Write header file 2250 header_file_tmp = os.path.join(tmpdir, "header") 2251 f = open(header_file_tmp, "w") 2252 vcf.Writer(f, header_obj) 2253 f.close() 2254 2255 # Replace #CHROM line with rel columns 2256 header_list = db_for_header.read_header_file( 2257 header_file=header_file_tmp 2258 ) 2259 header_list[-1] = "\t".join(db_header_columns) 2260 2261 # Remove CHROM line 2262 if remove_chrom_line: 2263 header_list.pop() 2264 2265 # Clean header 2266 if clean_header: 2267 header_list_clean = [] 2268 for head in header_list: 2269 # Clean head for malformed header 2270 head_clean = head 2271 head_clean = re.subn( 2272 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2273 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2274 head_clean, 2275 2, 2276 )[0] 2277 # Write header 2278 header_list_clean.append(head_clean) 2279 header_list = header_list_clean 2280 2281 tmp_header_name = output_file + output_file_ext 2282 2283 f = open(tmp_header_name, "w") 2284 for line in header_list: 2285 f.write(line) 2286 f.close() 2287 2288 return tmp_header_name 2289 2290 def export_variant_vcf( 2291 self, 2292 vcf_file, 2293 remove_info: bool = False, 2294 add_samples: bool = True, 2295 list_samples: list = [], 2296 where_clause: str = "", 2297 index: bool = False, 2298 threads: int | None = None, 2299 ) -> bool | None: 2300 """ 2301 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2302 remove INFO field, add samples, and control compression and indexing. 
2303 2304 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2305 written to. It is the output file that will contain the filtered VCF data based on the specified 2306 parameters 2307 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2308 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2309 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2310 in, defaults to False 2311 :type remove_info: bool (optional) 2312 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2313 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2314 If set to False, the samples will be removed. The default value is True, defaults to True 2315 :type add_samples: bool (optional) 2316 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2317 in the output VCF file. By default, all samples will be included. If you provide a list of 2318 samples, only those samples will be included in the output file 2319 :type list_samples: list 2320 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2321 determines whether or not to create an index for the output VCF file. If `index` is set to 2322 `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False 2323 :type index: bool (optional) 2324 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2325 number of threads to use for exporting the VCF file. It determines how many parallel threads 2326 will be used during the export process. More threads can potentially speed up the export process 2327 by utilizing multiple cores of the processor. 
If 2328 :type threads: int | None 2329 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2330 method with various parameters including the output file, query, threads, sort flag, and index 2331 flag. The `export_output` method is responsible for exporting the VCF data based on the 2332 specified parameters and configurations provided in the `export_variant_vcf` function. 2333 """ 2334 2335 # Config 2336 config = self.get_config() 2337 2338 # Extract VCF 2339 log.debug("Export VCF...") 2340 2341 # Table variants 2342 table_variants = self.get_table_variants() 2343 2344 # Threads 2345 if not threads: 2346 threads = self.get_threads() 2347 2348 # Info fields 2349 if remove_info: 2350 if not isinstance(remove_info, str): 2351 remove_info = "." 2352 info_field = f"""'{remove_info}' as INFO""" 2353 else: 2354 info_field = "INFO" 2355 2356 # Samples fields 2357 if add_samples: 2358 if not list_samples: 2359 list_samples = self.get_header_sample_list() 2360 if list_samples: 2361 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2362 else: 2363 samples_fields = "" 2364 log.debug(f"samples_fields: {samples_fields}") 2365 else: 2366 samples_fields = "" 2367 2368 # Where clause 2369 if where_clause is None: 2370 where_clause = "" 2371 2372 # Variants 2373 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2374 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2375 log.debug(f"sql_query_select={sql_query_select}") 2376 2377 return self.export_output( 2378 output_file=vcf_file, 2379 output_header=None, 2380 export_header=True, 2381 query=sql_query_select, 2382 parquet_partitions=None, 2383 chunk_size=config.get("chunk_size", None), 2384 threads=threads, 2385 sort=True, 2386 index=index, 2387 order_by=None, 2388 ) 2389 2390 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2391 """ 2392 It takes a list of commands and runs 
them in parallel using the number of threads specified 2393 2394 :param commands: A list of commands to run 2395 :param threads: The number of threads to use, defaults to 1 (optional) 2396 """ 2397 2398 run_parallel_commands(commands, threads) 2399 2400 def get_threads(self, default: int = 1) -> int: 2401 """ 2402 This function returns the number of threads to use for a job, with a default value of 1 if not 2403 specified. 2404 2405 :param default: The `default` parameter in the `get_threads` method is used to specify the 2406 default number of threads to use if no specific value is provided. If no value is provided for 2407 the `threads` parameter in the configuration or input parameters, the `default` value will be 2408 used, defaults to 1 2409 :type default: int (optional) 2410 :return: the number of threads to use for the current job. 2411 """ 2412 2413 # Config 2414 config = self.get_config() 2415 2416 # Param 2417 param = self.get_param() 2418 2419 # Input threads 2420 input_thread = param.get("threads", config.get("threads", None)) 2421 2422 # Check threads 2423 if not input_thread: 2424 threads = default 2425 elif int(input_thread) <= 0: 2426 threads = os.cpu_count() 2427 else: 2428 threads = int(input_thread) 2429 return threads 2430 2431 def get_memory(self, default: str = None) -> str: 2432 """ 2433 This function retrieves the memory value from parameters or configuration with a default value 2434 if not found. 2435 2436 :param default: The `get_memory` function takes in a default value as a string parameter. This 2437 default value is used as a fallback in case the `memory` parameter is not provided in the 2438 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2439 the function 2440 :type default: str 2441 :return: The `get_memory` function returns a string value representing the memory parameter. If 2442 the `input_memory` is provided in the parameters, it will return that value. 
Otherwise, it will 2443 return the default value provided as an argument to the function. 2444 """ 2445 2446 # Config 2447 config = self.get_config() 2448 2449 # Param 2450 param = self.get_param() 2451 2452 # Input threads 2453 input_memory = param.get("memory", config.get("memory", None)) 2454 2455 # Check threads 2456 if input_memory: 2457 memory = input_memory 2458 else: 2459 memory = default 2460 2461 return memory 2462 2463 def update_from_vcf(self, vcf_file: str) -> None: 2464 """ 2465 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2466 2467 :param vcf_file: the path to the VCF file 2468 """ 2469 2470 connexion_format = self.get_connexion_format() 2471 2472 if connexion_format in ["duckdb"]: 2473 self.update_from_vcf_duckdb(vcf_file) 2474 elif connexion_format in ["sqlite"]: 2475 self.update_from_vcf_sqlite(vcf_file) 2476 2477 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2478 """ 2479 It takes a VCF file and updates the INFO column of the variants table in the database with the 2480 INFO column of the VCF file 2481 2482 :param vcf_file: the path to the VCF file 2483 """ 2484 2485 # varaints table 2486 table_variants = self.get_table_variants() 2487 2488 # Loading VCF into temporaire table 2489 skip = self.get_header_length(file=vcf_file) 2490 vcf_df = pd.read_csv( 2491 vcf_file, 2492 sep="\t", 2493 engine="c", 2494 skiprows=skip, 2495 header=0, 2496 low_memory=False, 2497 ) 2498 sql_query_update = f""" 2499 UPDATE {table_variants} as table_variants 2500 SET INFO = concat( 2501 CASE 2502 WHEN INFO NOT IN ('', '.') 2503 THEN INFO 2504 ELSE '' 2505 END, 2506 ( 2507 SELECT 2508 concat( 2509 CASE 2510 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2511 THEN ';' 2512 ELSE '' 2513 END 2514 , 2515 CASE 2516 WHEN table_parquet.INFO NOT IN ('','.') 2517 THEN table_parquet.INFO 2518 ELSE '' 2519 END 2520 ) 2521 FROM vcf_df as table_parquet 2522 WHERE CAST(table_parquet.\"#CHROM\" AS 
VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2523 AND table_parquet.\"POS\" = table_variants.\"POS\" 2524 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2525 AND table_parquet.\"REF\" = table_variants.\"REF\" 2526 AND table_parquet.INFO NOT IN ('','.') 2527 ) 2528 ) 2529 ; 2530 """ 2531 self.conn.execute(sql_query_update) 2532 2533 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2534 """ 2535 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2536 table, then updates the INFO column of the variants table with the INFO column of the temporary 2537 table 2538 2539 :param vcf_file: The path to the VCF file you want to update the database with 2540 """ 2541 2542 # Create a temporary table for the VCF 2543 table_vcf = "tmp_vcf" 2544 sql_create = ( 2545 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2546 ) 2547 self.conn.execute(sql_create) 2548 2549 # Loading VCF into temporaire table 2550 vcf_df = pd.read_csv( 2551 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2552 ) 2553 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2554 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2555 2556 # Update table 'variants' with VCF data 2557 # warning: CONCAT as || operator 2558 sql_query_update = f""" 2559 UPDATE variants as table_variants 2560 SET INFO = CASE 2561 WHEN INFO NOT IN ('', '.') 2562 THEN INFO 2563 ELSE '' 2564 END || 2565 ( 2566 SELECT 2567 CASE 2568 WHEN table_variants.INFO NOT IN ('','.') 2569 AND table_vcf.INFO NOT IN ('','.') 2570 THEN ';' 2571 ELSE '' 2572 END || 2573 CASE 2574 WHEN table_vcf.INFO NOT IN ('','.') 2575 THEN table_vcf.INFO 2576 ELSE '' 2577 END 2578 FROM {table_vcf} as table_vcf 2579 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2580 AND table_vcf.\"POS\" = table_variants.\"POS\" 2581 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2582 AND table_vcf.\"REF\" = table_variants.\"REF\" 2583 
) 2584 """ 2585 self.conn.execute(sql_query_update) 2586 2587 # Drop temporary table 2588 sql_drop = f"DROP TABLE {table_vcf}" 2589 self.conn.execute(sql_drop) 2590 2591 def drop_variants_table(self) -> None: 2592 """ 2593 > This function drops the variants table 2594 """ 2595 2596 table_variants = self.get_table_variants() 2597 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2598 self.conn.execute(sql_table_variants) 2599 2600 def set_variant_id( 2601 self, variant_id_column: str = "variant_id", force: bool = None 2602 ) -> str: 2603 """ 2604 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2605 `#CHROM`, `POS`, `REF`, and `ALT` columns 2606 2607 :param variant_id_column: The name of the column to be created in the variants table, defaults 2608 to variant_id 2609 :type variant_id_column: str (optional) 2610 :param force: If True, the variant_id column will be created even if it already exists 2611 :type force: bool 2612 :return: The name of the column that contains the variant_id 2613 """ 2614 2615 # Assembly 2616 assembly = self.get_param().get( 2617 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2618 ) 2619 2620 # INFO/Tag prefix 2621 prefix = self.get_explode_infos_prefix() 2622 2623 # Explode INFO/SVTYPE 2624 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2625 2626 # variants table 2627 table_variants = self.get_table_variants() 2628 2629 # variant_id column 2630 if not variant_id_column: 2631 variant_id_column = "variant_id" 2632 2633 # Creta variant_id column 2634 if "variant_id" not in self.get_extra_infos() or force: 2635 2636 # Create column 2637 self.add_column( 2638 table_name=table_variants, 2639 column_name=variant_id_column, 2640 column_type="UBIGINT", 2641 default_value="0", 2642 ) 2643 2644 # Update column 2645 self.conn.execute( 2646 f""" 2647 UPDATE {table_variants} 2648 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", 
'"{prefix}SVTYPE"') 2649 """ 2650 ) 2651 2652 # Remove added columns 2653 for added_column in added_columns: 2654 self.drop_column(column=added_column) 2655 2656 # return variant_id column name 2657 return variant_id_column 2658 2659 def get_variant_id_column( 2660 self, variant_id_column: str = "variant_id", force: bool = None 2661 ) -> str: 2662 """ 2663 This function returns the variant_id column name 2664 2665 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2666 defaults to variant_id 2667 :type variant_id_column: str (optional) 2668 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2669 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2670 if it is not already set, or if it is set 2671 :type force: bool 2672 :return: The variant_id column name. 2673 """ 2674 2675 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2676 2677 ### 2678 # Annotation 2679 ### 2680 2681 def scan_databases( 2682 self, 2683 database_formats: list = ["parquet"], 2684 database_releases: list = ["current"], 2685 ) -> dict: 2686 """ 2687 The function `scan_databases` scans for available databases based on specified formats and 2688 releases. 2689 2690 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2691 of the databases to be scanned. In this case, the accepted format is "parquet" 2692 :type database_formats: list ["parquet"] 2693 :param database_releases: The `database_releases` parameter is a list that specifies the 2694 releases of the databases to be scanned. 
In the provided function, the default value for 2695 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2696 databases that are in the "current" 2697 :type database_releases: list 2698 :return: The function `scan_databases` returns a dictionary containing information about 2699 databases that match the specified formats and releases. 2700 """ 2701 2702 # Config 2703 config = self.get_config() 2704 2705 # Param 2706 param = self.get_param() 2707 2708 # Param - Assembly 2709 assembly = param.get("assembly", config.get("assembly", None)) 2710 if not assembly: 2711 assembly = DEFAULT_ASSEMBLY 2712 log.warning(f"Default assembly '{assembly}'") 2713 2714 # Scan for availabled databases 2715 log.info( 2716 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2717 ) 2718 databases_infos_dict = databases_infos( 2719 database_folder_releases=database_releases, 2720 database_formats=database_formats, 2721 assembly=assembly, 2722 config=config, 2723 ) 2724 log.info( 2725 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2726 ) 2727 2728 return databases_infos_dict 2729 2730 def annotation(self) -> None: 2731 """ 2732 It annotates the VCF file with the annotations specified in the config file. 
2733 """ 2734 2735 # Config 2736 config = self.get_config() 2737 2738 # Param 2739 param = self.get_param() 2740 2741 # Param - Assembly 2742 assembly = param.get("assembly", config.get("assembly", None)) 2743 if not assembly: 2744 assembly = DEFAULT_ASSEMBLY 2745 log.warning(f"Default assembly '{assembly}'") 2746 2747 # annotations databases folders 2748 annotations_databases = set( 2749 config.get("folders", {}) 2750 .get("databases", {}) 2751 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2752 + config.get("folders", {}) 2753 .get("databases", {}) 2754 .get("parquet", ["~/howard/databases/parquet/current"]) 2755 + config.get("folders", {}) 2756 .get("databases", {}) 2757 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2758 ) 2759 2760 # Get param annotations 2761 if param.get("annotations", None) and isinstance( 2762 param.get("annotations", None), str 2763 ): 2764 log.debug(param.get("annotations", None)) 2765 param_annotation_list = param.get("annotations").split(",") 2766 else: 2767 param_annotation_list = [] 2768 2769 # Each tools param 2770 if param.get("annotation_parquet", None) != None: 2771 log.debug( 2772 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2773 ) 2774 if isinstance(param.get("annotation_parquet", None), list): 2775 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2776 else: 2777 param_annotation_list.append(param.get("annotation_parquet")) 2778 if param.get("annotation_snpsift", None) != None: 2779 if isinstance(param.get("annotation_snpsift", None), list): 2780 param_annotation_list.append( 2781 "snpsift:" 2782 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2783 ) 2784 else: 2785 param_annotation_list.append( 2786 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2787 ) 2788 if param.get("annotation_snpeff", None) != None: 2789 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2790 if param.get("annotation_bcftools", 
None) != None: 2791 if isinstance(param.get("annotation_bcftools", None), list): 2792 param_annotation_list.append( 2793 "bcftools:" 2794 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2795 ) 2796 else: 2797 param_annotation_list.append( 2798 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2799 ) 2800 if param.get("annotation_annovar", None) != None: 2801 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2802 if param.get("annotation_exomiser", None) != None: 2803 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2804 if param.get("annotation_splice", None) != None: 2805 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2806 2807 # Merge param annotations list 2808 param["annotations"] = ",".join(param_annotation_list) 2809 2810 # debug 2811 log.debug(f"param_annotations={param['annotations']}") 2812 2813 if param.get("annotations"): 2814 2815 # Log 2816 # log.info("Annotations - Check annotation parameters") 2817 2818 if not "annotation" in param: 2819 param["annotation"] = {} 2820 2821 # List of annotations parameters 2822 annotations_list_input = {} 2823 if isinstance(param.get("annotations", None), str): 2824 annotation_file_list = [ 2825 value for value in param.get("annotations", "").split(",") 2826 ] 2827 for annotation_file in annotation_file_list: 2828 annotations_list_input[annotation_file] = {"INFO": None} 2829 else: 2830 annotations_list_input = param.get("annotations", {}) 2831 2832 log.info(f"Quick Annotations:") 2833 for annotation_key in list(annotations_list_input.keys()): 2834 log.info(f" {annotation_key}") 2835 2836 # List of annotations and associated fields 2837 annotations_list = {} 2838 2839 for annotation_file in annotations_list_input: 2840 2841 # Explode annotations if ALL 2842 if ( 2843 annotation_file.upper() == "ALL" 2844 or annotation_file.upper().startswith("ALL:") 2845 ): 2846 2847 # check ALL parameters (formats, releases) 
2848 annotation_file_split = annotation_file.split(":") 2849 database_formats = "parquet" 2850 database_releases = "current" 2851 for annotation_file_option in annotation_file_split[1:]: 2852 database_all_options_split = annotation_file_option.split("=") 2853 if database_all_options_split[0] == "format": 2854 database_formats = database_all_options_split[1].split("+") 2855 if database_all_options_split[0] == "release": 2856 database_releases = database_all_options_split[1].split("+") 2857 2858 # Scan for availabled databases 2859 databases_infos_dict = self.scan_databases( 2860 database_formats=database_formats, 2861 database_releases=database_releases, 2862 ) 2863 2864 # Add found databases in annotation parameters 2865 for database_infos in databases_infos_dict.keys(): 2866 annotations_list[database_infos] = {"INFO": None} 2867 2868 else: 2869 annotations_list[annotation_file] = annotations_list_input[ 2870 annotation_file 2871 ] 2872 2873 # Check each databases 2874 if len(annotations_list): 2875 2876 log.info( 2877 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
2878 ) 2879 2880 for annotation_file in annotations_list: 2881 2882 # Init 2883 annotations = annotations_list.get(annotation_file, None) 2884 2885 # Annotation snpEff 2886 if annotation_file.startswith("snpeff"): 2887 2888 log.debug(f"Quick Annotation snpEff") 2889 2890 if "snpeff" not in param["annotation"]: 2891 param["annotation"]["snpeff"] = {} 2892 2893 if "options" not in param["annotation"]["snpeff"]: 2894 param["annotation"]["snpeff"]["options"] = "" 2895 2896 # snpEff options in annotations 2897 param["annotation"]["snpeff"]["options"] = "".join( 2898 annotation_file.split(":")[1:] 2899 ) 2900 2901 # Annotation Annovar 2902 elif annotation_file.startswith("annovar"): 2903 2904 log.debug(f"Quick Annotation Annovar") 2905 2906 if "annovar" not in param["annotation"]: 2907 param["annotation"]["annovar"] = {} 2908 2909 if "annotations" not in param["annotation"]["annovar"]: 2910 param["annotation"]["annovar"]["annotations"] = {} 2911 2912 # Options 2913 annotation_file_split = annotation_file.split(":") 2914 for annotation_file_annotation in annotation_file_split[1:]: 2915 if annotation_file_annotation: 2916 param["annotation"]["annovar"]["annotations"][ 2917 annotation_file_annotation 2918 ] = annotations 2919 2920 # Annotation Exomiser 2921 elif annotation_file.startswith("exomiser"): 2922 2923 log.debug(f"Quick Annotation Exomiser") 2924 2925 param["annotation"]["exomiser"] = params_string_to_dict( 2926 annotation_file 2927 ) 2928 2929 # Annotation Splice 2930 elif annotation_file.startswith("splice"): 2931 2932 log.debug(f"Quick Annotation Splice") 2933 2934 param["annotation"]["splice"] = params_string_to_dict( 2935 annotation_file 2936 ) 2937 2938 # Annotation Parquet or BCFTOOLS 2939 else: 2940 2941 # Tools detection 2942 if annotation_file.startswith("bcftools:"): 2943 annotation_tool_initial = "bcftools" 2944 annotation_file = ":".join(annotation_file.split(":")[1:]) 2945 elif annotation_file.startswith("snpsift:"): 2946 annotation_tool_initial = 
"snpsift" 2947 annotation_file = ":".join(annotation_file.split(":")[1:]) 2948 else: 2949 annotation_tool_initial = None 2950 2951 # list of files 2952 annotation_file_list = annotation_file.replace("+", ":").split( 2953 ":" 2954 ) 2955 2956 for annotation_file in annotation_file_list: 2957 2958 if annotation_file: 2959 2960 # Annotation tool initial 2961 annotation_tool = annotation_tool_initial 2962 2963 # Find file 2964 annotation_file_found = None 2965 2966 # Expand user 2967 annotation_file = full_path(annotation_file) 2968 2969 if os.path.exists(annotation_file): 2970 annotation_file_found = annotation_file 2971 2972 else: 2973 # Find within assembly folders 2974 for annotations_database in annotations_databases: 2975 found_files = find_all( 2976 annotation_file, 2977 os.path.join( 2978 annotations_database, assembly 2979 ), 2980 ) 2981 if len(found_files) > 0: 2982 annotation_file_found = found_files[0] 2983 break 2984 if not annotation_file_found and not assembly: 2985 # Find within folders 2986 for ( 2987 annotations_database 2988 ) in annotations_databases: 2989 found_files = find_all( 2990 annotation_file, annotations_database 2991 ) 2992 if len(found_files) > 0: 2993 annotation_file_found = found_files[0] 2994 break 2995 log.debug( 2996 f"for {annotation_file} annotation_file_found={annotation_file_found}" 2997 ) 2998 2999 # Full path 3000 annotation_file_found = full_path(annotation_file_found) 3001 3002 if annotation_file_found: 3003 3004 database = Database(database=annotation_file_found) 3005 quick_annotation_format = database.get_format() 3006 quick_annotation_is_compressed = ( 3007 database.is_compressed() 3008 ) 3009 quick_annotation_is_indexed = os.path.exists( 3010 f"{annotation_file_found}.tbi" 3011 ) 3012 bcftools_preference = False 3013 3014 # Check Annotation Tool 3015 if not annotation_tool: 3016 if ( 3017 bcftools_preference 3018 and quick_annotation_format 3019 in ["vcf", "bed"] 3020 and quick_annotation_is_compressed 3021 and 
quick_annotation_is_indexed 3022 ): 3023 annotation_tool = "bcftools" 3024 elif quick_annotation_format in [ 3025 "vcf", 3026 "bed", 3027 "tsv", 3028 "tsv", 3029 "csv", 3030 "json", 3031 "tbl", 3032 "parquet", 3033 "duckdb", 3034 ]: 3035 annotation_tool = "parquet" 3036 else: 3037 log.error( 3038 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3039 ) 3040 raise ValueError( 3041 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3042 ) 3043 3044 log.debug( 3045 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3046 ) 3047 3048 # Annotation Tool dispatch 3049 if annotation_tool: 3050 if annotation_tool not in param["annotation"]: 3051 param["annotation"][annotation_tool] = {} 3052 if ( 3053 "annotations" 3054 not in param["annotation"][annotation_tool] 3055 ): 3056 param["annotation"][annotation_tool][ 3057 "annotations" 3058 ] = {} 3059 param["annotation"][annotation_tool][ 3060 "annotations" 3061 ][annotation_file_found] = annotations 3062 3063 else: 3064 log.error( 3065 f"Quick Annotation File {annotation_file} does NOT exist" 3066 ) 3067 3068 self.set_param(param) 3069 3070 if param.get("annotation", None): 3071 log.info("Annotations") 3072 if param.get("annotation", {}).get("parquet", None): 3073 log.info("Annotations 'parquet'...") 3074 self.annotation_parquet() 3075 if param.get("annotation", {}).get("bcftools", None): 3076 log.info("Annotations 'bcftools'...") 3077 self.annotation_bcftools() 3078 if param.get("annotation", {}).get("snpsift", None): 3079 log.info("Annotations 'snpsift'...") 3080 self.annotation_snpsift() 3081 if param.get("annotation", {}).get("annovar", None): 3082 log.info("Annotations 'annovar'...") 3083 self.annotation_annovar() 3084 if param.get("annotation", {}).get("snpeff", None): 3085 log.info("Annotations 'snpeff'...") 3086 self.annotation_snpeff() 3087 if param.get("annotation", {}).get("exomiser", 
None) is not None: 3088 log.info("Annotations 'exomiser'...") 3089 self.annotation_exomiser() 3090 if param.get("annotation", {}).get("splice", None) is not None: 3091 log.info("Annotations 'splice' ...") 3092 self.annotation_splice() 3093 3094 # Explode INFOS fields into table fields 3095 if self.get_explode_infos(): 3096 self.explode_infos( 3097 prefix=self.get_explode_infos_prefix(), 3098 fields=self.get_explode_infos_fields(), 3099 force=True, 3100 ) 3101 3102 def annotation_snpsift(self, threads: int = None) -> None: 3103 """ 3104 This function annotate with bcftools 3105 3106 :param threads: Number of threads to use 3107 :return: the value of the variable "return_value". 3108 """ 3109 3110 # DEBUG 3111 log.debug("Start annotation with bcftools databases") 3112 3113 # Threads 3114 if not threads: 3115 threads = self.get_threads() 3116 log.debug("Threads: " + str(threads)) 3117 3118 # Config 3119 config = self.get_config() 3120 log.debug("Config: " + str(config)) 3121 3122 # Config - snpSift 3123 snpsift_bin_command = get_bin_command( 3124 bin="SnpSift.jar", 3125 tool="snpsift", 3126 bin_type="jar", 3127 config=config, 3128 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3129 ) 3130 if not snpsift_bin_command: 3131 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3132 log.error(msg_err) 3133 raise ValueError(msg_err) 3134 3135 # Config - bcftools 3136 bcftools_bin_command = get_bin_command( 3137 bin="bcftools", 3138 tool="bcftools", 3139 bin_type="bin", 3140 config=config, 3141 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3142 ) 3143 if not bcftools_bin_command: 3144 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3145 log.error(msg_err) 3146 raise ValueError(msg_err) 3147 3148 # Config - BCFTools databases folders 3149 databases_folders = set( 3150 self.get_config() 3151 .get("folders", {}) 3152 .get("databases", {}) 3153 .get("annotations", ["."]) 3154 + self.get_config() 3155 .get("folders", {}) 3156 
.get("databases", {}) 3157 .get("bcftools", ["."]) 3158 ) 3159 log.debug("Databases annotations: " + str(databases_folders)) 3160 3161 # Param 3162 annotations = ( 3163 self.get_param() 3164 .get("annotation", {}) 3165 .get("snpsift", {}) 3166 .get("annotations", None) 3167 ) 3168 log.debug("Annotations: " + str(annotations)) 3169 3170 # Assembly 3171 assembly = self.get_param().get( 3172 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3173 ) 3174 3175 # Data 3176 table_variants = self.get_table_variants() 3177 3178 # Check if not empty 3179 log.debug("Check if not empty") 3180 sql_query_chromosomes = ( 3181 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3182 ) 3183 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3184 if not sql_query_chromosomes_df["count"][0]: 3185 log.info(f"VCF empty") 3186 return 3187 3188 # VCF header 3189 vcf_reader = self.get_header() 3190 log.debug("Initial header: " + str(vcf_reader.infos)) 3191 3192 # Existing annotations 3193 for vcf_annotation in self.get_header().infos: 3194 3195 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3196 log.debug( 3197 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3198 ) 3199 3200 if annotations: 3201 3202 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3203 3204 # Export VCF file 3205 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3206 3207 # Init 3208 commands = {} 3209 3210 for annotation in annotations: 3211 annotation_fields = annotations[annotation] 3212 3213 # Annotation Name 3214 annotation_name = os.path.basename(annotation) 3215 3216 if not annotation_fields: 3217 annotation_fields = {"INFO": None} 3218 3219 log.debug(f"Annotation '{annotation_name}'") 3220 log.debug( 3221 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3222 ) 3223 3224 # Create Database 3225 database = Database( 3226 database=annotation, 3227 databases_folders=databases_folders, 3228 
assembly=assembly, 3229 ) 3230 3231 # Find files 3232 db_file = database.get_database() 3233 db_file = full_path(db_file) 3234 db_hdr_file = database.get_header_file() 3235 db_hdr_file = full_path(db_hdr_file) 3236 db_file_type = database.get_format() 3237 db_tbi_file = f"{db_file}.tbi" 3238 db_file_compressed = database.is_compressed() 3239 3240 # Check if compressed 3241 if not db_file_compressed: 3242 log.error( 3243 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3244 ) 3245 raise ValueError( 3246 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3247 ) 3248 3249 # Check if indexed 3250 if not os.path.exists(db_tbi_file): 3251 log.error( 3252 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3253 ) 3254 raise ValueError( 3255 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3256 ) 3257 3258 # Check index - try to create if not exists 3259 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3260 log.error("Annotation failed: database not valid") 3261 log.error(f"Annotation annotation file: {db_file}") 3262 log.error(f"Annotation annotation header: {db_hdr_file}") 3263 log.error(f"Annotation annotation index: {db_tbi_file}") 3264 raise ValueError( 3265 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3266 ) 3267 else: 3268 3269 log.debug( 3270 f"Annotation '{annotation}' - file: " 3271 + str(db_file) 3272 + " and " 3273 + str(db_hdr_file) 3274 ) 3275 3276 # Load header as VCF object 3277 db_hdr_vcf = Variants(input=db_hdr_file) 3278 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3279 log.debug( 3280 "Annotation database header: " 3281 + str(db_hdr_vcf_header_infos) 3282 ) 3283 3284 # For all fields in database 3285 annotation_fields_full = False 3286 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3287 annotation_fields = { 3288 key: key for key in 
db_hdr_vcf_header_infos 3289 } 3290 log.debug( 3291 "Annotation database header - All annotations added: " 3292 + str(annotation_fields) 3293 ) 3294 annotation_fields_full = True 3295 3296 # # Create file for field rename 3297 # log.debug("Create file for field rename") 3298 # tmp_rename = NamedTemporaryFile( 3299 # prefix=self.get_prefix(), 3300 # dir=self.get_tmp_dir(), 3301 # suffix=".rename", 3302 # delete=False, 3303 # ) 3304 # tmp_rename_name = tmp_rename.name 3305 # tmp_files.append(tmp_rename_name) 3306 3307 # Number of fields 3308 nb_annotation_field = 0 3309 annotation_list = [] 3310 annotation_infos_rename_list = [] 3311 3312 for annotation_field in annotation_fields: 3313 3314 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3315 annotation_fields_new_name = annotation_fields.get( 3316 annotation_field, annotation_field 3317 ) 3318 if not annotation_fields_new_name: 3319 annotation_fields_new_name = annotation_field 3320 3321 # Check if field is in DB and if field is not elready in input data 3322 if ( 3323 annotation_field in db_hdr_vcf.get_header().infos 3324 and annotation_fields_new_name 3325 not in self.get_header().infos 3326 ): 3327 3328 log.info( 3329 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3330 ) 3331 3332 # BCFTools annotate param to rename fields 3333 if annotation_field != annotation_fields_new_name: 3334 annotation_infos_rename_list.append( 3335 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3336 ) 3337 3338 # Add INFO field to header 3339 db_hdr_vcf_header_infos_number = ( 3340 db_hdr_vcf_header_infos[annotation_field].num or "." 
3341 ) 3342 db_hdr_vcf_header_infos_type = ( 3343 db_hdr_vcf_header_infos[annotation_field].type 3344 or "String" 3345 ) 3346 db_hdr_vcf_header_infos_description = ( 3347 db_hdr_vcf_header_infos[annotation_field].desc 3348 or f"{annotation_field} description" 3349 ) 3350 db_hdr_vcf_header_infos_source = ( 3351 db_hdr_vcf_header_infos[annotation_field].source 3352 or "unknown" 3353 ) 3354 db_hdr_vcf_header_infos_version = ( 3355 db_hdr_vcf_header_infos[annotation_field].version 3356 or "unknown" 3357 ) 3358 3359 vcf_reader.infos[annotation_fields_new_name] = ( 3360 vcf.parser._Info( 3361 annotation_fields_new_name, 3362 db_hdr_vcf_header_infos_number, 3363 db_hdr_vcf_header_infos_type, 3364 db_hdr_vcf_header_infos_description, 3365 db_hdr_vcf_header_infos_source, 3366 db_hdr_vcf_header_infos_version, 3367 self.code_type_map[ 3368 db_hdr_vcf_header_infos_type 3369 ], 3370 ) 3371 ) 3372 3373 annotation_list.append(annotation_field) 3374 3375 nb_annotation_field += 1 3376 3377 else: 3378 3379 if ( 3380 annotation_field 3381 not in db_hdr_vcf.get_header().infos 3382 ): 3383 log.warning( 3384 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3385 ) 3386 if ( 3387 annotation_fields_new_name 3388 in self.get_header().infos 3389 ): 3390 log.warning( 3391 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3392 ) 3393 3394 log.info( 3395 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3396 ) 3397 3398 annotation_infos = ",".join(annotation_list) 3399 3400 if annotation_infos != "": 3401 3402 # Annotated VCF (and error file) 3403 tmp_annotation_vcf_name = os.path.join( 3404 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3405 ) 3406 tmp_annotation_vcf_name_err = ( 3407 tmp_annotation_vcf_name + ".err" 3408 ) 3409 3410 # Add fields to annotate 3411 if not annotation_fields_full: 3412 annotation_infos_option = f"-info {annotation_infos}" 3413 else: 
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate variants with `bcftools annotate`, using VCF/BED annotation databases.

        For each database configured in param["annotation"]["bcftools"]["annotations"],
        this exports the variants table to a temporary VCF, builds one region BED per
        chromosome (windows of +/-1Mb around variant positions, merged), runs one
        `bcftools annotate` command per chromosome, merges all annotated outputs with
        `bcftools merge`, scans the commands' stderr files, and loads the merged result
        back into the variants table via `update_from_vcf`.

        :param threads: Number of threads to use; defaults to `self.get_threads()`
        :raises ValueError: if the bcftools binary cannot be resolved, if a database
            file is missing, not compressed, or not tabix-indexed, or if any command
            wrote an "[E::" error line to its stderr file
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        # NOTE(review): delete_tmp is computed (keep tmp files in debug mode) but never
        # read afterwards in this method - tmp files are removed by the "rm -f" appended
        # to the merge command regardless; confirm whether debug-mode retention is intended
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders (shared "annotations" + bcftools-specific)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - databases to annotate with, as {database: {field: new_name}}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param overrides config, falls back to application default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Nothing to do on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        # NOTE(review): this temp file is created (delete=False) even when annotations
        # is empty/None, in which case it is never written nor removed - confirm
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (mutated below to register new INFO fields)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (already-present INFO fields are skipped later)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            # Accumulators across all databases/chromosomes
            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields means "all INFO fields"
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database (resolves the file within databases folders/assembly)
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (bcftools annotate needs a tabix index)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check database file and header file both exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # "ALL"/"INFO" expands to every field declared in the database header
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not already in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header (fall back to safe defaults)
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # annotation_list.append(annotation_field)
                            # Rename syntax "NEW:=INFO/OLD" is handled by bcftools annotate -c
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command - keep only "##" meta lines from the header file
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        # One bcftools command per chromosome, restricted to variant regions
                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: +/-1Mb windows around each variant, clamped at 0
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            # Overlapping windows are merged to keep the BED small
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files (one annotated output VCF per command, plus its .err)
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command - annotate within regions, then tabix-index the output
                            # (stderr of both steps appended to the .err file)
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file (INFO stripped: annotations are merged back afterwards)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands (split total threads across commands)
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # round() may yield 0 when there are more commands than threads
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands (by rewriting the command strings)
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

            # Merge all per-chromosome annotated VCFs back with the original
            tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

            if tmp_ann_vcf_list_cmd:

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=True,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)

                # Tmp file remove command (chained after the merge so cleanup only
                # happens if the merge command itself runs)
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                # Command merge
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                log.info(
                    f"Annotation - Annotation merging "
                    + str(len(commands))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Error messages - scan the .err files written by the shell commands
                # ("[W::" / "[E::" prefixes are htslib/bcftools message conventions)
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

                # Update variants
                log.info(f"Annotation - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)
3955 Used if no full "analysis" is provided. 3956 Default: "exome" 3957 - "phenopacket" (dict/file): 3958 Samples and phenotipic features parameters (see Exomiser docs). 3959 Either a dict, or a file in JSON or YAML format. 3960 Default: None 3961 - "subject" (dict): 3962 Sample parameters (see Exomiser docs). 3963 Example: 3964 "subject": 3965 { 3966 "id": "ISDBM322017", 3967 "sex": "FEMALE" 3968 } 3969 Default: None 3970 - "sample" (string): 3971 Sample name to construct "subject" section: 3972 "subject": 3973 { 3974 "id": "<sample>", 3975 "sex": "UNKNOWN_SEX" 3976 } 3977 Default: None 3978 - "phenotypicFeatures" (dict) 3979 Phenotypic features to construct "subject" section. 3980 Example: 3981 "phenotypicFeatures": 3982 [ 3983 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 3984 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 3985 ] 3986 - "hpo" (list) 3987 List of HPO ids as phenotypic features. 3988 Example: 3989 "hpo": ['0001156', '0001363', '0011304', '0010055'] 3990 Default: [] 3991 - "outputOptions" (dict): 3992 Output options (see Exomiser docs). 3993 Default: 3994 "output_options" = 3995 { 3996 "outputContributingVariantsOnly": False, 3997 "numGenes": 0, 3998 "outputFormats": ["TSV_VARIANT", "VCF"] 3999 } 4000 - "transcript_source" (string): 4001 Transcript source (either "refseq", "ucsc", "ensembl") 4002 Default: "refseq" 4003 - "exomiser_to_info" (boolean): 4004 Add exomiser TSV file columns as INFO fields in VCF. 4005 Default: False 4006 - "release" (string): 4007 Exomise database release. 4008 If not exists, database release will be downloaded (take a while). 4009 Default: None (provided by application.properties configuration file) 4010 - "exomiser_application_properties" (file): 4011 Exomiser configuration file (see Exomiser docs). 4012 Useful to automatically download databases (especially for specific genome databases). 
4013 4014 Notes: 4015 - If no sample in parameters, first sample in VCF will be chosen 4016 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4017 4018 :param threads: The number of threads to use 4019 :return: None. 4020 """ 4021 4022 # DEBUG 4023 log.debug("Start annotation with Exomiser databases") 4024 4025 # Threads 4026 if not threads: 4027 threads = self.get_threads() 4028 log.debug("Threads: " + str(threads)) 4029 4030 # Config 4031 config = self.get_config() 4032 log.debug("Config: " + str(config)) 4033 4034 # Config - Folders - Databases 4035 databases_folders = ( 4036 config.get("folders", {}) 4037 .get("databases", {}) 4038 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4039 ) 4040 databases_folders = full_path(databases_folders) 4041 if not os.path.exists(databases_folders): 4042 log.error(f"Databases annotations: {databases_folders} NOT found") 4043 log.debug("Databases annotations: " + str(databases_folders)) 4044 4045 # Config - Exomiser 4046 exomiser_bin_command = get_bin_command( 4047 bin="exomiser-cli*.jar", 4048 tool="exomiser", 4049 bin_type="jar", 4050 config=config, 4051 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4052 ) 4053 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4054 if not exomiser_bin_command: 4055 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4056 log.error(msg_err) 4057 raise ValueError(msg_err) 4058 4059 # Param 4060 param = self.get_param() 4061 log.debug("Param: " + str(param)) 4062 4063 # Param - Exomiser 4064 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4065 log.debug(f"Param Exomiser: {param_exomiser}") 4066 4067 # Param - Assembly 4068 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4069 log.debug("Assembly: " + str(assembly)) 4070 4071 # Data 4072 table_variants = self.get_table_variants() 4073 4074 # Check if not empty 4075 log.debug("Check if not empty") 4076 sql_query_chromosomes = 
( 4077 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4078 ) 4079 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4080 log.info(f"VCF empty") 4081 return False 4082 4083 # VCF header 4084 vcf_reader = self.get_header() 4085 log.debug("Initial header: " + str(vcf_reader.infos)) 4086 4087 # Samples 4088 samples = self.get_header_sample_list() 4089 if not samples: 4090 log.error("No Samples in VCF") 4091 return False 4092 log.debug(f"Samples: {samples}") 4093 4094 # Memory limit 4095 memory_limit = self.get_memory("8G") 4096 log.debug(f"memory_limit: {memory_limit}") 4097 4098 # Exomiser java options 4099 exomiser_java_options = ( 4100 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4101 ) 4102 log.debug(f"Exomiser java options: {exomiser_java_options}") 4103 4104 # Download Exomiser (if not exists) 4105 exomiser_release = param_exomiser.get("release", None) 4106 exomiser_application_properties = param_exomiser.get( 4107 "exomiser_application_properties", None 4108 ) 4109 databases_download_exomiser( 4110 assemblies=[assembly], 4111 exomiser_folder=databases_folders, 4112 exomiser_release=exomiser_release, 4113 exomiser_phenotype_release=exomiser_release, 4114 exomiser_application_properties=exomiser_application_properties, 4115 ) 4116 4117 # Force annotation 4118 force_update_annotation = True 4119 4120 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4121 log.debug("Start annotation Exomiser") 4122 4123 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4124 4125 # tmp_dir = "/tmp/exomiser" 4126 4127 ### ANALYSIS ### 4128 ################ 4129 4130 # Create analysis.json through analysis dict 4131 # either analysis in param or by default 4132 # depending on preset exome/genome) 4133 4134 # Init analysis dict 4135 param_exomiser_analysis_dict = {} 4136 4137 # analysis from param 4138 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4139 
param_exomiser_analysis = full_path(param_exomiser_analysis) 4140 4141 # If analysis in param -> load anlaysis json 4142 if param_exomiser_analysis: 4143 4144 # If param analysis is a file and exists 4145 if isinstance(param_exomiser_analysis, str) and os.path.exists( 4146 param_exomiser_analysis 4147 ): 4148 # Load analysis file into analysis dict (either yaml or json) 4149 with open(param_exomiser_analysis) as json_file: 4150 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4151 4152 # If param analysis is a dict 4153 elif isinstance(param_exomiser_analysis, dict): 4154 # Load analysis dict into analysis dict (either yaml or json) 4155 param_exomiser_analysis_dict = param_exomiser_analysis 4156 4157 # Error analysis type 4158 else: 4159 log.error(f"Analysis type unknown. Check param file.") 4160 raise ValueError(f"Analysis type unknown. Check param file.") 4161 4162 # Case no input analysis config file/dict 4163 # Use preset (exome/genome) to open default config file 4164 if not param_exomiser_analysis_dict: 4165 4166 # default preset 4167 default_preset = "exome" 4168 4169 # Get param preset or default preset 4170 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4171 4172 # Try to find if preset is a file 4173 if os.path.exists(param_exomiser_preset): 4174 # Preset file is provided in full path 4175 param_exomiser_analysis_default_config_file = ( 4176 param_exomiser_preset 4177 ) 4178 # elif os.path.exists(full_path(param_exomiser_preset)): 4179 # # Preset file is provided in full path 4180 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4181 elif os.path.exists( 4182 os.path.join(folder_config, param_exomiser_preset) 4183 ): 4184 # Preset file is provided a basename in config folder (can be a path with subfolders) 4185 param_exomiser_analysis_default_config_file = os.path.join( 4186 folder_config, param_exomiser_preset 4187 ) 4188 else: 4189 # Construct preset file 4190 
param_exomiser_analysis_default_config_file = os.path.join( 4191 folder_config, 4192 f"preset-{param_exomiser_preset}-analysis.json", 4193 ) 4194 4195 # If preset file exists 4196 param_exomiser_analysis_default_config_file = full_path( 4197 param_exomiser_analysis_default_config_file 4198 ) 4199 if os.path.exists(param_exomiser_analysis_default_config_file): 4200 # Load prest file into analysis dict (either yaml or json) 4201 with open( 4202 param_exomiser_analysis_default_config_file 4203 ) as json_file: 4204 # param_exomiser_analysis_dict[""] = json.load(json_file) 4205 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4206 json_file 4207 ) 4208 4209 # Error preset file 4210 else: 4211 log.error( 4212 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4213 ) 4214 raise ValueError( 4215 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4216 ) 4217 4218 # If no analysis dict created 4219 if not param_exomiser_analysis_dict: 4220 log.error(f"No analysis config") 4221 raise ValueError(f"No analysis config") 4222 4223 # Log 4224 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4225 4226 ### PHENOPACKET ### 4227 ################### 4228 4229 # If no PhenoPacket in analysis dict -> check in param 4230 if "phenopacket" not in param_exomiser_analysis_dict: 4231 4232 # If PhenoPacket in param -> load anlaysis json 4233 if param_exomiser.get("phenopacket", None): 4234 4235 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4236 param_exomiser_phenopacket = full_path( 4237 param_exomiser_phenopacket 4238 ) 4239 4240 # If param phenopacket is a file and exists 4241 if isinstance( 4242 param_exomiser_phenopacket, str 4243 ) and os.path.exists(param_exomiser_phenopacket): 4244 # Load phenopacket file into analysis dict (either yaml or json) 4245 with open(param_exomiser_phenopacket) as json_file: 4246 param_exomiser_analysis_dict["phenopacket"] = ( 4247 yaml.safe_load(json_file) 
4248 ) 4249 4250 # If param phenopacket is a dict 4251 elif isinstance(param_exomiser_phenopacket, dict): 4252 # Load phenopacket dict into analysis dict (either yaml or json) 4253 param_exomiser_analysis_dict["phenopacket"] = ( 4254 param_exomiser_phenopacket 4255 ) 4256 4257 # Error phenopacket type 4258 else: 4259 log.error(f"Phenopacket type unknown. Check param file.") 4260 raise ValueError( 4261 f"Phenopacket type unknown. Check param file." 4262 ) 4263 4264 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4265 if "phenopacket" not in param_exomiser_analysis_dict: 4266 4267 # Init PhenoPacket 4268 param_exomiser_analysis_dict["phenopacket"] = { 4269 "id": "analysis", 4270 "proband": {}, 4271 } 4272 4273 ### Add subject ### 4274 4275 # If subject exists 4276 param_exomiser_subject = param_exomiser.get("subject", {}) 4277 4278 # If subject not exists -> found sample ID 4279 if not param_exomiser_subject: 4280 4281 # Found sample ID in param 4282 sample = param_exomiser.get("sample", None) 4283 4284 # Find sample ID (first sample) 4285 if not sample: 4286 sample_list = self.get_header_sample_list() 4287 if len(sample_list) > 0: 4288 sample = sample_list[0] 4289 else: 4290 log.error(f"No sample found") 4291 raise ValueError(f"No sample found") 4292 4293 # Create subject 4294 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4295 4296 # Add to dict 4297 param_exomiser_analysis_dict["phenopacket"][ 4298 "subject" 4299 ] = param_exomiser_subject 4300 4301 ### Add "phenotypicFeatures" ### 4302 4303 # If phenotypicFeatures exists 4304 param_exomiser_phenotypicfeatures = param_exomiser.get( 4305 "phenotypicFeatures", [] 4306 ) 4307 4308 # If phenotypicFeatures not exists -> Try to infer from hpo list 4309 if not param_exomiser_phenotypicfeatures: 4310 4311 # Found HPO in param 4312 param_exomiser_hpo = param_exomiser.get("hpo", []) 4313 4314 # Split HPO if list in string format separated by comma 4315 if 
isinstance(param_exomiser_hpo, str): 4316 param_exomiser_hpo = param_exomiser_hpo.split(",") 4317 4318 # Create HPO list 4319 for hpo in param_exomiser_hpo: 4320 hpo_clean = re.sub("[^0-9]", "", hpo) 4321 param_exomiser_phenotypicfeatures.append( 4322 { 4323 "type": { 4324 "id": f"HP:{hpo_clean}", 4325 "label": f"HP:{hpo_clean}", 4326 } 4327 } 4328 ) 4329 4330 # Add to dict 4331 param_exomiser_analysis_dict["phenopacket"][ 4332 "phenotypicFeatures" 4333 ] = param_exomiser_phenotypicfeatures 4334 4335 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4336 if not param_exomiser_phenotypicfeatures: 4337 for step in param_exomiser_analysis_dict.get( 4338 "analysis", {} 4339 ).get("steps", []): 4340 if "hiPhivePrioritiser" in step: 4341 param_exomiser_analysis_dict.get("analysis", {}).get( 4342 "steps", [] 4343 ).remove(step) 4344 4345 ### Add Input File ### 4346 4347 # Initial file name and htsFiles 4348 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4349 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4350 { 4351 "uri": tmp_vcf_name, 4352 "htsFormat": "VCF", 4353 "genomeAssembly": assembly, 4354 } 4355 ] 4356 4357 ### Add metaData ### 4358 4359 # If metaData not in analysis dict 4360 if "metaData" not in param_exomiser_analysis_dict: 4361 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4362 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4363 "createdBy": "howard", 4364 "phenopacketSchemaVersion": 1, 4365 } 4366 4367 ### OutputOptions ### 4368 4369 # Init output result folder 4370 output_results = os.path.join(tmp_dir, "results") 4371 4372 # If no outputOptions in analysis dict 4373 if "outputOptions" not in param_exomiser_analysis_dict: 4374 4375 # default output formats 4376 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4377 4378 # Get outputOptions in param 4379 output_options = param_exomiser.get("outputOptions", None) 4380 4381 # If no output_options in param -> check 4382 if not output_options: 
4383 output_options = { 4384 "outputContributingVariantsOnly": False, 4385 "numGenes": 0, 4386 "outputFormats": defaut_output_formats, 4387 } 4388 4389 # Replace outputDirectory in output options 4390 output_options["outputDirectory"] = output_results 4391 output_options["outputFileName"] = "howard" 4392 4393 # Add outputOptions in analysis dict 4394 param_exomiser_analysis_dict["outputOptions"] = output_options 4395 4396 else: 4397 4398 # Replace output_results and output format (if exists in param) 4399 param_exomiser_analysis_dict["outputOptions"][ 4400 "outputDirectory" 4401 ] = output_results 4402 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4403 list( 4404 set( 4405 param_exomiser_analysis_dict.get( 4406 "outputOptions", {} 4407 ).get("outputFormats", []) 4408 + ["TSV_VARIANT", "VCF"] 4409 ) 4410 ) 4411 ) 4412 4413 # log 4414 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4415 4416 ### ANALYSIS FILE ### 4417 ##################### 4418 4419 ### Full JSON analysis config file ### 4420 4421 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4422 with open(exomiser_analysis, "w") as fp: 4423 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4424 4425 ### SPLIT analysis and sample config files 4426 4427 # Splitted analysis dict 4428 param_exomiser_analysis_dict_for_split = ( 4429 param_exomiser_analysis_dict.copy() 4430 ) 4431 4432 # Phenopacket JSON file 4433 exomiser_analysis_phenopacket = os.path.join( 4434 tmp_dir, "analysis_phenopacket.json" 4435 ) 4436 with open(exomiser_analysis_phenopacket, "w") as fp: 4437 json.dump( 4438 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4439 fp, 4440 indent=4, 4441 ) 4442 4443 # Analysis JSON file without Phenopacket parameters 4444 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4445 exomiser_analysis_analysis = os.path.join( 4446 tmp_dir, "analysis_analysis.json" 4447 ) 4448 with open(exomiser_analysis_analysis, "w") as fp: 4449 
json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4450 4451 ### INITAL VCF file ### 4452 ####################### 4453 4454 ### Create list of samples to use and include inti initial VCF file #### 4455 4456 # Subject (main sample) 4457 # Get sample ID in analysis dict 4458 sample_subject = ( 4459 param_exomiser_analysis_dict.get("phenopacket", {}) 4460 .get("subject", {}) 4461 .get("id", None) 4462 ) 4463 sample_proband = ( 4464 param_exomiser_analysis_dict.get("phenopacket", {}) 4465 .get("proband", {}) 4466 .get("subject", {}) 4467 .get("id", None) 4468 ) 4469 sample = [] 4470 if sample_subject: 4471 sample.append(sample_subject) 4472 if sample_proband: 4473 sample.append(sample_proband) 4474 4475 # Get sample ID within Pedigree 4476 pedigree_persons_list = ( 4477 param_exomiser_analysis_dict.get("phenopacket", {}) 4478 .get("pedigree", {}) 4479 .get("persons", {}) 4480 ) 4481 4482 # Create list with all sample ID in pedigree (if exists) 4483 pedigree_persons = [] 4484 for person in pedigree_persons_list: 4485 pedigree_persons.append(person.get("individualId")) 4486 4487 # Concat subject sample ID and samples ID in pedigreesamples 4488 samples = list(set(sample + pedigree_persons)) 4489 4490 # Check if sample list is not empty 4491 if not samples: 4492 log.error(f"No samples found") 4493 raise ValueError(f"No samples found") 4494 4495 # Create VCF with sample (either sample in param or first one by default) 4496 # Export VCF file 4497 self.export_variant_vcf( 4498 vcf_file=tmp_vcf_name, 4499 remove_info=True, 4500 add_samples=True, 4501 list_samples=samples, 4502 index=False, 4503 ) 4504 4505 ### Execute Exomiser ### 4506 ######################## 4507 4508 # Init command 4509 exomiser_command = "" 4510 4511 # Command exomiser options 4512 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4513 4514 # Release 4515 exomiser_release = 
param_exomiser.get("release", None) 4516 if exomiser_release: 4517 # phenotype data version 4518 exomiser_options += ( 4519 f" --exomiser.phenotype.data-version={exomiser_release} " 4520 ) 4521 # data version 4522 exomiser_options += ( 4523 f" --exomiser.{assembly}.data-version={exomiser_release} " 4524 ) 4525 # variant white list 4526 variant_white_list_file = ( 4527 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4528 ) 4529 if os.path.exists( 4530 os.path.join( 4531 databases_folders, assembly, variant_white_list_file 4532 ) 4533 ): 4534 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4535 4536 # transcript_source 4537 transcript_source = param_exomiser.get( 4538 "transcript_source", None 4539 ) # ucsc, refseq, ensembl 4540 if transcript_source: 4541 exomiser_options += ( 4542 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4543 ) 4544 4545 # If analysis contain proband param 4546 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4547 "proband", {} 4548 ): 4549 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4550 4551 # If no proband (usually uniq sample) 4552 else: 4553 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 4554 4555 # Log 4556 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 4557 4558 # Run command 4559 result = subprocess.call( 4560 exomiser_command_analysis.split(), stdout=subprocess.PIPE 4561 ) 4562 if result: 4563 log.error("Exomiser command failed") 4564 raise ValueError("Exomiser command failed") 4565 4566 ### RESULTS ### 4567 ############### 4568 4569 ### Annotate with TSV fields ### 4570 4571 # Init result tsv file 4572 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 4573 4574 # Init result tsv file 4575 output_results_tsv = os.path.join(output_results, 
"howard.variants.tsv") 4576 4577 # Parse TSV file and explode columns in INFO field 4578 if exomiser_to_info and os.path.exists(output_results_tsv): 4579 4580 # Log 4581 log.debug("Exomiser columns to VCF INFO field") 4582 4583 # Retrieve columns and types 4584 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 4585 output_results_tsv_df = self.get_query_to_df(query) 4586 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 4587 4588 # Init concat fields for update 4589 sql_query_update_concat_fields = [] 4590 4591 # Fields to avoid 4592 fields_to_avoid = [ 4593 "CONTIG", 4594 "START", 4595 "END", 4596 "REF", 4597 "ALT", 4598 "QUAL", 4599 "FILTER", 4600 "GENOTYPE", 4601 ] 4602 4603 # List all columns to add into header 4604 for header_column in output_results_tsv_columns: 4605 4606 # If header column is enable 4607 if header_column not in fields_to_avoid: 4608 4609 # Header info type 4610 header_info_type = "String" 4611 header_column_df = output_results_tsv_df[header_column] 4612 header_column_df_dtype = header_column_df.dtype 4613 if header_column_df_dtype == object: 4614 if ( 4615 pd.to_numeric(header_column_df, errors="coerce") 4616 .notnull() 4617 .all() 4618 ): 4619 header_info_type = "Float" 4620 else: 4621 header_info_type = "Integer" 4622 4623 # Header info 4624 characters_to_validate = ["-"] 4625 pattern = "[" + "".join(characters_to_validate) + "]" 4626 header_info_name = re.sub( 4627 pattern, 4628 "_", 4629 f"Exomiser_{header_column}".replace("#", ""), 4630 ) 4631 header_info_number = "." 
4632 header_info_description = ( 4633 f"Exomiser {header_column} annotation" 4634 ) 4635 header_info_source = "Exomiser" 4636 header_info_version = "unknown" 4637 header_info_code = CODE_TYPE_MAP[header_info_type] 4638 vcf_reader.infos[header_info_name] = vcf.parser._Info( 4639 header_info_name, 4640 header_info_number, 4641 header_info_type, 4642 header_info_description, 4643 header_info_source, 4644 header_info_version, 4645 header_info_code, 4646 ) 4647 4648 # Add field to add for update to concat fields 4649 sql_query_update_concat_fields.append( 4650 f""" 4651 CASE 4652 WHEN table_parquet."{header_column}" NOT IN ('','.') 4653 THEN concat( 4654 '{header_info_name}=', 4655 table_parquet."{header_column}", 4656 ';' 4657 ) 4658 4659 ELSE '' 4660 END 4661 """ 4662 ) 4663 4664 # Update query 4665 sql_query_update = f""" 4666 UPDATE {table_variants} as table_variants 4667 SET INFO = concat( 4668 CASE 4669 WHEN INFO NOT IN ('', '.') 4670 THEN INFO 4671 ELSE '' 4672 END, 4673 CASE 4674 WHEN table_variants.INFO NOT IN ('','.') 4675 THEN ';' 4676 ELSE '' 4677 END, 4678 ( 4679 SELECT 4680 concat( 4681 {",".join(sql_query_update_concat_fields)} 4682 ) 4683 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 4684 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 4685 AND table_parquet.\"START\" = table_variants.\"POS\" 4686 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 4687 AND table_parquet.\"REF\" = table_variants.\"REF\" 4688 ) 4689 ) 4690 ; 4691 """ 4692 4693 # Update 4694 self.conn.execute(sql_query_update) 4695 4696 ### Annotate with VCF INFO field ### 4697 4698 # Init result VCF file 4699 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 4700 4701 # If VCF exists 4702 if os.path.exists(output_results_vcf): 4703 4704 # Log 4705 log.debug("Exomiser result VCF update variants") 4706 4707 # Find Exomiser INFO field annotation in header 4708 with 
gzip.open(output_results_vcf, "rt") as f: 4709 header_list = self.read_vcf_header(f) 4710 exomiser_vcf_header = vcf.Reader( 4711 io.StringIO("\n".join(header_list)) 4712 ) 4713 4714 # Add annotation INFO field to header 4715 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 4716 4717 # Update variants with VCF 4718 self.update_from_vcf(output_results_vcf) 4719 4720 return True 4721 4722 def annotation_snpeff(self, threads: int = None) -> None: 4723 """ 4724 This function annotate with snpEff 4725 4726 :param threads: The number of threads to use 4727 :return: the value of the variable "return_value". 4728 """ 4729 4730 # DEBUG 4731 log.debug("Start annotation with snpeff databases") 4732 4733 # Threads 4734 if not threads: 4735 threads = self.get_threads() 4736 log.debug("Threads: " + str(threads)) 4737 4738 # DEBUG 4739 delete_tmp = True 4740 if self.get_config().get("verbosity", "warning") in ["debug"]: 4741 delete_tmp = False 4742 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4743 4744 # Config 4745 config = self.get_config() 4746 log.debug("Config: " + str(config)) 4747 4748 # Config - Folders - Databases 4749 databases_folders = ( 4750 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 4751 ) 4752 log.debug("Databases annotations: " + str(databases_folders)) 4753 4754 # # Config - Java 4755 # java_bin = get_bin( 4756 # tool="java", 4757 # bin="java", 4758 # bin_type="bin", 4759 # config=config, 4760 # default_folder="/usr/bin", 4761 # ) 4762 # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))): 4763 # log.error(f"Annotation failed: no java bin '{java_bin}'") 4764 # raise ValueError(f"Annotation failed: no java bin '{java_bin}'") 4765 4766 # # Config - snpEff bin 4767 # snpeff_jar = get_bin( 4768 # tool="snpeff", 4769 # bin="snpEff.jar", 4770 # bin_type="jar", 4771 # config=config, 4772 # default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4773 # ) 4774 # if not (os.path.exists(snpeff_jar) or 
(snpeff_jar and which(snpeff_jar))): 4775 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4776 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4777 4778 # Config - snpEff bin command 4779 snpeff_bin_command = get_bin_command( 4780 bin="snpEff.jar", 4781 tool="snpeff", 4782 bin_type="jar", 4783 config=config, 4784 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4785 ) 4786 if not snpeff_bin_command: 4787 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4788 log.error(msg_err) 4789 raise ValueError(msg_err) 4790 4791 # Config - snpEff databases 4792 snpeff_databases = ( 4793 config.get("folders", {}) 4794 .get("databases", {}) 4795 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4796 ) 4797 snpeff_databases = full_path(snpeff_databases) 4798 if snpeff_databases is not None and snpeff_databases != "": 4799 log.debug(f"Create snpEff databases folder") 4800 if not os.path.exists(snpeff_databases): 4801 os.makedirs(snpeff_databases) 4802 4803 # Param 4804 param = self.get_param() 4805 log.debug("Param: " + str(param)) 4806 4807 # Param 4808 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4809 log.debug("Options: " + str(options)) 4810 4811 # Param - Assembly 4812 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4813 4814 # Param - Options 4815 snpeff_options = ( 4816 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4817 ) 4818 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4819 snpeff_csvstats = ( 4820 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4821 ) 4822 if snpeff_stats: 4823 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4824 snpeff_stats = full_path(snpeff_stats) 4825 snpeff_options += f" -stats {snpeff_stats}" 4826 if snpeff_csvstats: 4827 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4828 snpeff_csvstats = full_path(snpeff_csvstats) 4829 
snpeff_options += f" -csvStats {snpeff_csvstats}" 4830 4831 # Data 4832 table_variants = self.get_table_variants() 4833 4834 # Check if not empty 4835 log.debug("Check if not empty") 4836 sql_query_chromosomes = ( 4837 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4838 ) 4839 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4840 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4841 log.info(f"VCF empty") 4842 return 4843 4844 # Export in VCF 4845 log.debug("Create initial file to annotate") 4846 tmp_vcf = NamedTemporaryFile( 4847 prefix=self.get_prefix(), 4848 dir=self.get_tmp_dir(), 4849 suffix=".vcf.gz", 4850 delete=True, 4851 ) 4852 tmp_vcf_name = tmp_vcf.name 4853 4854 # VCF header 4855 vcf_reader = self.get_header() 4856 log.debug("Initial header: " + str(vcf_reader.infos)) 4857 4858 # Existing annotations 4859 for vcf_annotation in self.get_header().infos: 4860 4861 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4862 log.debug( 4863 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4864 ) 4865 4866 # Memory limit 4867 # if config.get("memory", None): 4868 # memory_limit = config.get("memory", "8G") 4869 # else: 4870 # memory_limit = "8G" 4871 memory_limit = self.get_memory("8G") 4872 log.debug(f"memory_limit: {memory_limit}") 4873 4874 # snpEff java options 4875 snpeff_java_options = ( 4876 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4877 ) 4878 log.debug(f"Exomiser java options: {snpeff_java_options}") 4879 4880 force_update_annotation = True 4881 4882 if "ANN" not in self.get_header().infos or force_update_annotation: 4883 4884 # Check snpEff database 4885 log.debug(f"Check snpEff databases {[assembly]}") 4886 databases_download_snpeff( 4887 folder=snpeff_databases, assemblies=[assembly], config=config 4888 ) 4889 4890 # Export VCF file 4891 self.export_variant_vcf( 4892 vcf_file=tmp_vcf_name, 4893 remove_info=True, 
4894 add_samples=False, 4895 index=True, 4896 ) 4897 4898 # Tmp file 4899 err_files = [] 4900 tmp_annotate_vcf = NamedTemporaryFile( 4901 prefix=self.get_prefix(), 4902 dir=self.get_tmp_dir(), 4903 suffix=".vcf", 4904 delete=False, 4905 ) 4906 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4907 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4908 err_files.append(tmp_annotate_vcf_name_err) 4909 4910 # Command 4911 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 4912 log.debug(f"Annotation - snpEff command: {snpeff_command}") 4913 run_parallel_commands([snpeff_command], 1) 4914 4915 # Error messages 4916 log.info(f"Error/Warning messages:") 4917 error_message_command_all = [] 4918 error_message_command_warning = [] 4919 error_message_command_err = [] 4920 for err_file in err_files: 4921 with open(err_file, "r") as f: 4922 for line in f: 4923 message = line.strip() 4924 error_message_command_all.append(message) 4925 if line.startswith("[W::"): 4926 error_message_command_warning.append(message) 4927 if line.startswith("[E::"): 4928 error_message_command_err.append(f"{err_file}: " + message) 4929 # log info 4930 for message in list( 4931 set(error_message_command_err + error_message_command_warning) 4932 ): 4933 log.info(f" {message}") 4934 # debug info 4935 for message in list(set(error_message_command_all)): 4936 log.debug(f" {message}") 4937 # failed 4938 if len(error_message_command_err): 4939 log.error("Annotation failed: Error in commands") 4940 raise ValueError("Annotation failed: Error in commands") 4941 4942 # Find annotation in header 4943 with open(tmp_annotate_vcf_name, "rt") as f: 4944 header_list = self.read_vcf_header(f) 4945 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 4946 4947 for ann in annovar_vcf_header.infos: 4948 if ann not in self.get_header().infos: 4949 vcf_reader.infos[ann] = 
annovar_vcf_header.infos.get(ann)

            # Update variants
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with Annovar.

        For each configured Annovar database (param ``annotation.annovar.annotations``),
        this method exports the variants table to a temporary VCF, runs
        ``table_annovar.pl``, post-processes the result through a sed/awk/bcftools
        pipeline (field cleanup, renaming, field selection), merges all annotated
        VCFs with ``bcftools merge``, and updates the variants table from the
        merged VCF. Header INFO definitions found in the annotated VCF are added
        to the in-memory VCF header.

        :param threads: number of threads to use (defaults to ``self.get_threads()``)
        :return: None (returns early if the variants table is empty)
        :raises ValueError: if the annovar or bcftools binary cannot be resolved,
            or if the external commands wrote error messages to their stderr files
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp en Err files
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed here but never used below — the
        # cleanup at the end of this method runs under `if True:` and always
        # removes tmp files, even in debug mode. TODO confirm intent.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases (folder is created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly (folder is created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # NOTE(review): hard-coded — existing annotations are always re-annotated
        force_update_annotation = True

        if annotations:

            # NOTE(review): `commands` is never used in this method
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing database files)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One annovar run (and one annotated VCF) per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is re-initialized here, so only the
                # current annotation's stderr file is scanned for errors below
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # Field new name if parameterized.
                    # NOTE(review): renaming is NOT fully managed yet (TODO in
                    # original); the rename file is still written for bcftools.
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (append "old new" mapping to rename file)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "f" filter-based, "g" gene-based, "r" region-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options (remaining options are passed through as --key=value)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                # "^INFO/field" entries tell bcftools -x to KEEP those fields
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr files; warnings are logged,
                # errors abort the whole annotation
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: original export + all per-database annotated VCFs
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF and add any new
                # INFO definitions to the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        # NOTE(review): `if True:` always cleans up, ignoring delete_tmp above
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)

    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with parquet (or attached database) files,
        by generating and executing per-chromosome SQL UPDATE queries that
        append annotation fields to the variants table's INFO column.

        :param threads: number of threads to use for the annotation
            (defaults to ``self.get_threads()``)
        :return: None (returns early if the variants table is empty)
        :raises ValueError: if an annotation database file or its header file
            cannot be found
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed but not used in this method
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: search both "annotations" and "parquet" database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation: replace already-existing INFO fields
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", 
False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # Force Append Annotation: only fill fields that are empty ('' or '.')
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS (total, used for the final summary log)
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): added_columns is never appended to in this method, so
        # the drop loop at the end is a no-op as written — TODO confirm
        added_columns = []

        # drop indexes
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-annotation: scan for available databases and add
            # each of them as a full-INFO annotation
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion (attach external DB if needed)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields:
                    # synthesize String INFO definitions for extra columns that
                    # are not declared in the database header
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate (regions databases only)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields: build one CASE expression per field
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field: when updating, first
                            # strip the existing field occurrence from INFO
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO
                                query = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO = REGEXP_REPLACE(
                                            concat(table_variants.INFO,''),
                                            ';*{annotation_fields_new_name}=[^;]*',
                                            ''
                                        )
                                WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (fall back to defaults
                            # for missing number/type/description metadata)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append: only annotate variants where the field is
                            # currently empty
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                    THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                    ELSE ''
                                END
                                """
                                )
                            # Found in a specific column
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
                                    THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
                                    ELSE ''
                                END
                                """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # (fast path: copy the database INFO column wholesale)
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # start from the field-removal queries (update option)
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database: join on POS
                            # falling within [START+1, END] intervals, with
                            # per-POS aggregation of overlapping regions
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT 
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                            )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database: exact match on
                            # CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query: append the generated fields
                            # to INFO, inserting a ';' only when INFO is
                            # non-empty and new content exists
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO = 
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                                AND (
                                                    concat({sql_query_annotation_update_info_sets_sql})
                                                )
                                                NOT IN ('','.') 
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # (the generated concat() can nest very deeply)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # DuckDB UPDATE returns a one-row "Count" column
                            # with the number of updated rows
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def annotation_splice(self, threads: int = None) -> None:
        """
        Annotate variants with splice prediction tools (SpliceAI / SPiP),
        run through a Docker-based pipeline.

        :param threads: The number of threads to use
        :return: None
5925 """ 5926 5927 # DEBUG 5928 log.debug("Start annotation with splice tools") 5929 5930 # Threads 5931 if not threads: 5932 threads = self.get_threads() 5933 log.debug("Threads: " + str(threads)) 5934 5935 # DEBUG 5936 delete_tmp = True 5937 if self.get_config().get("verbosity", "warning") in ["debug"]: 5938 delete_tmp = False 5939 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5940 5941 # Config 5942 config = self.get_config() 5943 log.debug("Config: " + str(config)) 5944 splice_config = config.get("tools", {}).get("splice", {}) 5945 if not splice_config: 5946 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 5947 if not splice_config: 5948 msg_err = "No Splice tool config" 5949 log.error(msg_err) 5950 raise ValueError(msg_err) 5951 log.debug(f"splice_config={splice_config}") 5952 5953 # Config - Folders - Databases 5954 databases_folders = ( 5955 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 5956 ) 5957 log.debug("Databases annotations: " + str(databases_folders)) 5958 5959 # Splice docker image 5960 splice_docker_image = splice_config.get("docker").get("image") 5961 5962 # Pull splice image if it's not already there 5963 if not check_docker_image_exists(splice_docker_image): 5964 log.warning( 5965 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 5966 ) 5967 try: 5968 command(f"docker pull {splice_config.get('docker').get('image')}") 5969 except subprocess.CalledProcessError: 5970 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 5971 log.error(msg_err) 5972 raise ValueError(msg_err) 5973 return None 5974 5975 # Config - splice databases 5976 splice_databases = ( 5977 config.get("folders", {}) 5978 .get("databases", {}) 5979 .get("splice", DEFAULT_SPLICE_FOLDER) 5980 ) 5981 splice_databases = full_path(splice_databases) 5982 5983 # Param 5984 param = self.get_param() 5985 log.debug("Param: " + str(param)) 5986 5987 # Param 5988 options = 
param.get("annotation", {}).get("splice", {}) 5989 log.debug("Options: " + str(options)) 5990 5991 # Data 5992 table_variants = self.get_table_variants() 5993 5994 # Check if not empty 5995 log.debug("Check if not empty") 5996 sql_query_chromosomes = ( 5997 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5998 ) 5999 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6000 log.info("VCF empty") 6001 return None 6002 6003 # Export in VCF 6004 log.debug("Create initial file to annotate") 6005 6006 # Create output folder 6007 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6008 if not os.path.exists(output_folder): 6009 Path(output_folder).mkdir(parents=True, exist_ok=True) 6010 6011 # Create tmp VCF file 6012 tmp_vcf = NamedTemporaryFile( 6013 prefix=self.get_prefix(), 6014 dir=output_folder, 6015 suffix=".vcf", 6016 delete=False, 6017 ) 6018 tmp_vcf_name = tmp_vcf.name 6019 6020 # VCF header 6021 header = self.get_header() 6022 6023 # Existing annotations 6024 for vcf_annotation in self.get_header().infos: 6025 6026 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6027 log.debug( 6028 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6029 ) 6030 6031 # Memory limit 6032 if config.get("memory", None): 6033 memory_limit = config.get("memory", "8G").upper() 6034 # upper() 6035 else: 6036 memory_limit = "8G" 6037 log.debug(f"memory_limit: {memory_limit}") 6038 6039 # Check number of variants to annotate 6040 where_clause_regex_spliceai = r"SpliceAI_\w+" 6041 where_clause_regex_spip = r"SPiP_\w+" 6042 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6043 df_list_of_variants_to_annotate = self.get_query_to_df( 6044 query=f""" SELECT * FROM variants {where_clause} """ 6045 ) 6046 if len(df_list_of_variants_to_annotate) == 0: 6047 log.warning( 6048 f"No variants to 
annotate with splice. Variants probably already annotated with splice" 6049 ) 6050 return None 6051 else: 6052 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6053 6054 # Export VCF file 6055 self.export_variant_vcf( 6056 vcf_file=tmp_vcf_name, 6057 remove_info=True, 6058 add_samples=True, 6059 index=False, 6060 where_clause=where_clause, 6061 ) 6062 6063 # Create docker container and launch splice analysis 6064 if splice_config: 6065 6066 # Splice mount folders 6067 mount_folders = splice_config.get("mount", {}) 6068 6069 # Genome mount 6070 mount_folders[ 6071 config.get("folders", {}) 6072 .get("databases", {}) 6073 .get("genomes", DEFAULT_GENOME_FOLDER) 6074 ] = "ro" 6075 6076 # SpliceAI mount 6077 mount_folders[ 6078 config.get("folders", {}) 6079 .get("databases", {}) 6080 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6081 ] = "ro" 6082 6083 # Genome mount 6084 mount_folders[ 6085 config.get("folders", {}) 6086 .get("databases", {}) 6087 .get("spip", DEFAULT_SPIP_FOLDER) 6088 ] = "ro" 6089 6090 # Mount folders 6091 mount = [] 6092 6093 # Config mount 6094 mount = [ 6095 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6096 for path, mode in mount_folders.items() 6097 ] 6098 6099 if any(value for value in splice_config.values() if value is None): 6100 log.warning("At least one splice config parameter is empty") 6101 return None 6102 6103 # Params in splice nf 6104 def check_values(dico: dict): 6105 """ 6106 Ensure parameters for NF splice pipeline 6107 """ 6108 for key, val in dico.items(): 6109 if key == "genome": 6110 if any( 6111 assemb in options.get("genome", {}) 6112 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6113 ): 6114 yield f"--{key} hg19" 6115 elif any( 6116 assemb in options.get("genome", {}) 6117 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6118 ): 6119 yield f"--{key} hg38" 6120 elif ( 6121 (isinstance(val, str) and val) 6122 or isinstance(val, int) 6123 or isinstance(val, bool) 6124 ): 6125 yield f"--{key} 
{val}" 6126 6127 # Genome 6128 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6129 options["genome"] = genome 6130 6131 # NF params 6132 nf_params = [] 6133 6134 # Add options 6135 if options: 6136 nf_params = list(check_values(options)) 6137 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6138 else: 6139 log.debug("No NF params provided") 6140 6141 # Add threads 6142 if "threads" not in options.keys(): 6143 nf_params.append(f"--threads {threads}") 6144 6145 # Genome path 6146 genome_path = find_genome( 6147 config.get("folders", {}) 6148 .get("databases", {}) 6149 .get("genomes", DEFAULT_GENOME_FOLDER), 6150 file=f"{genome}.fa", 6151 ) 6152 # Add genome path 6153 if not genome_path: 6154 raise ValueError( 6155 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6156 ) 6157 else: 6158 log.debug(f"Genome: {genome_path}") 6159 nf_params.append(f"--genome_path {genome_path}") 6160 6161 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6162 """ 6163 Setting up updated databases for SPiP and SpliceAI 6164 """ 6165 6166 try: 6167 6168 # SpliceAI assembly transcriptome 6169 spliceai_assembly = os.path.join( 6170 config.get("folders", {}) 6171 .get("databases", {}) 6172 .get("spliceai", {}), 6173 options.get("genome"), 6174 "transcriptome", 6175 ) 6176 spip_assembly = options.get("genome") 6177 6178 spip = find( 6179 f"transcriptome_{spip_assembly}.RData", 6180 config.get("folders", {}).get("databases", {}).get("spip", {}), 6181 ) 6182 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6183 log.debug(f"SPiP annotations: {spip}") 6184 log.debug(f"SpliceAI annotations: {spliceai}") 6185 if spip and spliceai: 6186 return [ 6187 f"--spip_transcriptome {spip}", 6188 f"--spliceai_annotations {spliceai}", 6189 ] 6190 else: 6191 # TODO crash and go on with basic annotations ? 
6192 # raise ValueError( 6193 # "Can't find splice databases in configuration EXIT" 6194 # ) 6195 log.warning( 6196 "Can't find splice databases in configuration, use annotations file from image" 6197 ) 6198 except TypeError: 6199 log.warning( 6200 "Can't find splice databases in configuration, use annotations file from image" 6201 ) 6202 return [] 6203 6204 # Add options, check if transcriptome option have already beend provided 6205 if ( 6206 "spip_transcriptome" not in nf_params 6207 and "spliceai_transcriptome" not in nf_params 6208 ): 6209 splice_reference = splice_annotations(options, config) 6210 if splice_reference: 6211 nf_params.extend(splice_reference) 6212 6213 nf_params.append(f"--output_folder {output_folder}") 6214 6215 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6216 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6217 log.debug(cmd) 6218 6219 splice_config["docker"]["command"] = cmd 6220 6221 docker_cmd = get_bin_command( 6222 tool="splice", 6223 bin_type="docker", 6224 config=config, 6225 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6226 add_options=f"--name {random_uuid} {' '.join(mount)}", 6227 ) 6228 6229 # Docker debug 6230 # if splice_config.get("rm_container"): 6231 # rm_container = "--rm" 6232 # else: 6233 # rm_container = "" 6234 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6235 6236 log.debug(docker_cmd) 6237 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6238 log.debug(res.stdout) 6239 if res.stderr: 6240 log.error(res.stderr) 6241 res.check_returncode() 6242 else: 6243 log.warning(f"Splice tool configuration not found: {config}") 6244 
6245 # Update variants 6246 log.info("Annotation - Updating...") 6247 # Test find output vcf 6248 log.debug( 6249 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6250 ) 6251 output_vcf = [] 6252 # Wrong folder to look in 6253 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6254 if ( 6255 files 6256 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6257 ): 6258 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6259 # log.debug(os.listdir(options.get("output_folder"))) 6260 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6261 if not output_vcf: 6262 log.debug( 6263 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6264 ) 6265 else: 6266 # Get new header from annotated vcf 6267 log.debug(f"Initial header: {len(header.infos)} fields") 6268 # Create new header with splice infos 6269 new_vcf = Variants(input=output_vcf[0]) 6270 new_vcf_header = new_vcf.get_header().infos 6271 for keys, infos in new_vcf_header.items(): 6272 if keys not in header.infos.keys(): 6273 header.infos[keys] = infos 6274 log.debug(f"New header: {len(header.infos)} fields") 6275 log.debug(f"Splice tmp output: {output_vcf[0]}") 6276 self.update_from_vcf(output_vcf[0]) 6277 6278 # Remove folder 6279 remove_if_exists(output_folder) 6280 6281 ### 6282 # Prioritization 6283 ### 6284 6285 def get_config_default(self, name: str) -> dict: 6286 """ 6287 The function `get_config_default` returns a dictionary containing default configurations for 6288 various calculations and prioritizations. 6289 6290 :param name: The `get_config_default` function returns a dictionary containing default 6291 configurations for different calculations and prioritizations. 
        The `name` parameter is used to
        specify which specific configuration to retrieve from the dictionary
        :type name: str
        :return: The function `get_config_default` returns a dictionary containing default configuration
        settings for different calculations and prioritizations. The specific configuration settings are
        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
        matches a key in the `config_default` dictionary, the corresponding configuration settings are
        returned. If there is no match, None is returned.
        """

        # Built-in defaults, keyed by top-level section name
        # ("calculations", "prioritizations")
        config_default = {
            # Calculation operations: "type" is either "sql" (an
            # "operation_query" expression computes the output column) or
            # "python" (dispatch to "function_name" with "function_params")
            "calculations": {
                "variant_chr_pos_alt_ref": {
                    "type": "sql",
                    "name": "variant_chr_pos_alt_ref",
                    "description": "Create a variant ID with chromosome, position, alt and ref",
                    "available": False,
                    "output_column_name": "variant_chr_pos_alt_ref",
                    "output_column_type": "String",
                    "output_column_description": "variant ID with chromosome, position, alt and ref",
                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                    "operation_info": True,
                },
                "VARTYPE": {
                    "type": "sql",
                    "name": "VARTYPE",
                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                    "available": True,
                    "output_column_name": "VARTYPE",
                    "output_column_type": "String",
                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                    # SVTYPE (when present) takes precedence over the
                    # length-based REF/ALT classification
                    "operation_query": """
                    CASE
                        WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                        WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                        WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                        WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                        WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                        ELSE 'UNDEFINED'
                    END
                    """,
                    "info_fields": ["SVTYPE"],
                    "operation_info": True,
                },
                "snpeff_hgvs": {
                    "type": "python",
                    "name": "snpeff_hgvs",
                    "description": "HGVS nomenclatures from snpEff annotation",
                    "available": True,
                    "function_name": "calculation_extract_snpeff_hgvs",
                    "function_params": ["snpeff_hgvs", "ANN"],
                },
                "snpeff_ann_explode": {
                    "type": "python",
                    "name": "snpeff_ann_explode",
                    "description": "Explode snpEff annotations with uniquify values",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "fields", "snpeff_", "ANN"],
                },
                "snpeff_ann_explode_uniquify": {
                    "type": "python",
                    "name": "snpeff_ann_explode_uniquify",
                    "description": "Explode snpEff annotations",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
                },
                "snpeff_ann_explode_json": {
                    "type": "python",
                    "name": "snpeff_ann_explode_json",
                    "description": "Explode snpEff annotations in JSON format",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
                },
                "NOMEN": {
                    "type": "python",
                    "name": "NOMEN",
                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
                    "available": True,
                    "function_name": "calculation_extract_nomen",
                    "function_params": [],
                },
                "FINDBYPIPELINE": {
                    "type": "python",
                    "name": "FINDBYPIPELINE",
                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbypipeline"],
                },
                "FINDBYSAMPLE": {
                    "type": "python",
                    "name": "FINDBYSAMPLE",
                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbysample"],
                },
                "GENOTYPECONCORDANCE": {
                    "type": "python",
                    "name": "GENOTYPECONCORDANCE",
                    "description": "Concordance of genotype for multi caller VCF",
                    "available": True,
                    "function_name": "calculation_genotype_concordance",
                    "function_params": [],
                },
                "BARCODE": {
                    "type": "python",
                    "name": "BARCODE",
                    "description": "BARCODE as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode",
                    "function_params": [],
                },
                "BARCODEFAMILY": {
                    "type": "python",
                    "name": "BARCODEFAMILY",
                    "description": "BARCODEFAMILY as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode_family",
                    "function_params": ["BCF"],
                },
                "TRIO": {
                    "type": "python",
                    "name": "TRIO",
                    "description": "Inheritance for a trio family",
                    "available": True,
                    "function_name": "calculation_trio",
                    "function_params": [],
                },
                "VAF": {
                    "type": "python",
                    "name": "VAF",
                    "description": "Variant Allele Frequency (VAF) harmonization",
                    "available": True,
                    "function_name": "calculation_vaf_normalization",
                    "function_params": [],
                },
                "VAF_stats": {
                    "type": "python",
                    "name": "VAF_stats",
                    "description": "Variant Allele Frequency (VAF) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["VAF"],
                },
                "DP_stats": {
                    "type": "python",
                    "name": "DP_stats",
                    "description": "Depth (DP) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["DP"],
                },
                "variant_id": {
                    "type": "python",
                    "name": "variant_id",
                    "description": "Variant ID generated from variant position and type",
                    "available": True,
                    "function_name": "calculation_variant_id",
                    "function_params": [],
                },
                "transcripts_json": {
                    "type": "python",
                    "name": "transcripts_json",
                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": ["transcripts_json", None],
                },
                "transcripts_ann": {
                    "type": "python",
                    "name": "transcripts_ann",
                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, "transcripts_ann"],
                },
                "transcripts_annotations": {
                    "type": "python",
                    "name": "transcripts_annotations",
                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, None],
                },
                "transcripts_prioritization": {
                    "type": "python",
                    "name": "transcripts_prioritization",
                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_prioritization",
                    "function_params": [],
                },
            },
            # Prioritization profiles: for each INFO field, a list of criteria
            # ("type"/"value" comparison) with the score/flag/comment applied
            # when a variant matches
            "prioritizations": {
                "default": {
                    "filter": [
                        {
                            "type": "notequals",
                            "value": "!PASS|\\.",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": ["Bad variant quality"],
                        },
                        {
                            "type": "equals",
                            "value": "REJECT",
                            "score": -20,
                            "flag": "PASS",
                            "comment": ["Bad variant quality"],
                        },
                    ],
                    "DP": [
                        {
                            "type": "gte",
                            "value": "50",
                            "score": 5,
                            "flag": "PASS",
                            "comment": ["DP higher than 50"],
                        }
                    ],
                    "ANN": [
                        {
                            "type": "contains",
                            "value": "HIGH",
                            "score": 5,
                            "flag": "PASS",
                            "comment": [
                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODERATE",
                            "score": 3,
                            "flag": "PASS",
                            "comment": [
                                "A non-disruptive variant that might change protein effectiveness"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "LOW",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Assumed to be mostly harmless or unlikely to change protein behavior"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODIFIER",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                            ],
                        },
                    ],
                }
            },
        }

        # None for any section name not defined above
        return config_default.get(name, None)

    def get_config_json(
        self, name: str, config_dict: dict = {}, config_file: str = None
    ) -> dict:
        """
        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
        default values, a dictionary, and a file.

        :param name: The `name` parameter in the `get_config_json` function is a string that represents
        the name of the configuration. It is used to identify and retrieve the configuration settings
        for a specific component or module
        :type name: str
        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
        dictionary that allows you to provide additional configuration settings or overrides. When you
        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
        the key is the configuration setting you want to override or
        :type config_dict: dict
        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
        specify the path to a configuration file that contains additional settings. If provided, the
        function will read the contents of this file and update the configuration dictionary with the
        values found in the file, overriding any existing values with the
        :type config_file: str
        :return: The function `get_config_json` returns a dictionary containing the configuration
        settings.
        """

        # Create with default prioritizations
        # Start from the built-in defaults for this configuration section.
        # NOTE(review): get_config_default() returns None for an unknown `name`;
        # the item assignments below would then raise TypeError — confirm `name`
        # is always a known section
        config_default = self.get_config_default(name=name)
        configuration = config_default
        # log.debug(f"configuration={configuration}")

        # Replace prioritizations from dict
        # NOTE: mutable default `config_dict={}` is only read here, never
        # mutated, so the shared-default pitfall does not apply
        for config in config_dict:
            configuration[config] = config_dict[config]

        # Replace prioritizations from file
        # File entries override both defaults and dict entries (applied last)
        config_file = full_path(config_file)
        if config_file:
            if os.path.exists(config_file):
                with open(config_file) as config_file_content:
                    config_file_dict = json.load(config_file_content)
                    for config in config_file_dict:
                        configuration[config] = config_file_dict[config]
            else:
                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
                log.error(msg_error)
                raise ValueError(msg_error)

        return configuration

    def prioritization(
        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
    ) -> bool:
        """
        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
        prioritizes variants based on configured profiles and criteria.

        :param table: The `table` parameter in the `prioritization` function is used to specify the name
        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
        a table name is provided, the method will prioritize the variants in that specific table
        :type table: str
        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
        provided, the code will use a default prefix value of "PZ"
        :type pz_prefix: str
        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
These parameters can include 6625 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 6626 configurations needed for the prioritization of variants in a V 6627 :type pz_param: dict 6628 :return: A boolean value (True) is being returned from the `prioritization` function. 6629 """ 6630 6631 # Config 6632 config = self.get_config() 6633 6634 # Param 6635 param = self.get_param() 6636 6637 # Prioritization param 6638 if pz_param is not None: 6639 prioritization_param = pz_param 6640 else: 6641 prioritization_param = param.get("prioritization", {}) 6642 6643 # Configuration profiles 6644 prioritization_config_file = prioritization_param.get( 6645 "prioritization_config", None 6646 ) 6647 prioritization_config_file = full_path(prioritization_config_file) 6648 prioritizations_config = self.get_config_json( 6649 name="prioritizations", config_file=prioritization_config_file 6650 ) 6651 6652 # Prioritization prefix 6653 pz_prefix_default = "PZ" 6654 if pz_prefix is None: 6655 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 6656 6657 # Prioritization options 6658 profiles = prioritization_param.get("profiles", []) 6659 if isinstance(profiles, str): 6660 profiles = profiles.split(",") 6661 pzfields = prioritization_param.get( 6662 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 6663 ) 6664 if isinstance(pzfields, str): 6665 pzfields = pzfields.split(",") 6666 default_profile = prioritization_param.get("default_profile", None) 6667 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 6668 prioritization_score_mode = prioritization_param.get( 6669 "prioritization_score_mode", "HOWARD" 6670 ) 6671 6672 # Quick Prioritizations 6673 prioritizations = param.get("prioritizations", None) 6674 if prioritizations: 6675 log.info("Quick Prioritization:") 6676 for profile in prioritizations.split(","): 6677 if profile not in profiles: 6678 profiles.append(profile) 6679 log.info(f" {profile}") 6680 6681 # If 
profile "ALL" provided, all profiles in the config profiles 6682 if "ALL" in profiles: 6683 profiles = list(prioritizations_config.keys()) 6684 6685 for profile in profiles: 6686 if prioritizations_config.get(profile, None): 6687 log.debug(f"Profile '{profile}' configured") 6688 else: 6689 msg_error = f"Profile '{profile}' NOT configured" 6690 log.error(msg_error) 6691 raise ValueError(msg_error) 6692 6693 if profiles: 6694 log.info(f"Prioritization... ") 6695 else: 6696 log.debug(f"No profile defined") 6697 return False 6698 6699 if not default_profile and len(profiles): 6700 default_profile = profiles[0] 6701 6702 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 6703 log.debug("Profiles to check: " + str(list(profiles))) 6704 6705 # Variables 6706 if table is not None: 6707 table_variants = table 6708 else: 6709 table_variants = self.get_table_variants(clause="update") 6710 log.debug(f"Table to prioritize: {table_variants}") 6711 6712 # Added columns 6713 added_columns = [] 6714 6715 # Create list of PZfields 6716 # List of PZFields 6717 list_of_pzfields_original = pzfields + [ 6718 pzfield + pzfields_sep + profile 6719 for pzfield in pzfields 6720 for profile in profiles 6721 ] 6722 list_of_pzfields = [] 6723 log.debug(f"{list_of_pzfields_original}") 6724 6725 # Remove existing PZfields to use if exists 6726 for pzfield in list_of_pzfields_original: 6727 if self.get_header().infos.get(pzfield, None) is None: 6728 list_of_pzfields.append(pzfield) 6729 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 6730 else: 6731 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 6732 6733 if list_of_pzfields: 6734 6735 # Explode Infos prefix 6736 explode_infos_prefix = self.get_explode_infos_prefix() 6737 6738 # PZfields tags description 6739 PZfields_INFOS = { 6740 f"{pz_prefix}Tags": { 6741 "ID": f"{pz_prefix}Tags", 6742 "Number": ".", 6743 "Type": "String", 6744 "Description": "Variant tags based on annotation 
criteria", 6745 }, 6746 f"{pz_prefix}Score": { 6747 "ID": f"{pz_prefix}Score", 6748 "Number": 1, 6749 "Type": "Integer", 6750 "Description": "Variant score based on annotation criteria", 6751 }, 6752 f"{pz_prefix}Flag": { 6753 "ID": f"{pz_prefix}Flag", 6754 "Number": 1, 6755 "Type": "String", 6756 "Description": "Variant flag based on annotation criteria", 6757 }, 6758 f"{pz_prefix}Comment": { 6759 "ID": f"{pz_prefix}Comment", 6760 "Number": ".", 6761 "Type": "String", 6762 "Description": "Variant comment based on annotation criteria", 6763 }, 6764 f"{pz_prefix}Infos": { 6765 "ID": f"{pz_prefix}Infos", 6766 "Number": ".", 6767 "Type": "String", 6768 "Description": "Variant infos based on annotation criteria", 6769 }, 6770 } 6771 6772 # Create INFO fields if not exist 6773 for field in PZfields_INFOS: 6774 field_ID = PZfields_INFOS[field]["ID"] 6775 field_description = PZfields_INFOS[field]["Description"] 6776 if field_ID not in self.get_header().infos and field_ID in pzfields: 6777 field_description = ( 6778 PZfields_INFOS[field]["Description"] 6779 + f", profile {default_profile}" 6780 ) 6781 self.get_header().infos[field_ID] = vcf.parser._Info( 6782 field_ID, 6783 PZfields_INFOS[field]["Number"], 6784 PZfields_INFOS[field]["Type"], 6785 field_description, 6786 "unknown", 6787 "unknown", 6788 code_type_map[PZfields_INFOS[field]["Type"]], 6789 ) 6790 6791 # Create INFO fields if not exist for each profile 6792 for profile in prioritizations_config: 6793 if profile in profiles or profiles == []: 6794 for field in PZfields_INFOS: 6795 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 6796 field_description = ( 6797 PZfields_INFOS[field]["Description"] 6798 + f", profile {profile}" 6799 ) 6800 if ( 6801 field_ID not in self.get_header().infos 6802 and field in pzfields 6803 ): 6804 self.get_header().infos[field_ID] = vcf.parser._Info( 6805 field_ID, 6806 PZfields_INFOS[field]["Number"], 6807 PZfields_INFOS[field]["Type"], 6808 field_description, 6809 
"unknown", 6810 "unknown", 6811 code_type_map[PZfields_INFOS[field]["Type"]], 6812 ) 6813 6814 # Header 6815 for pzfield in list_of_pzfields: 6816 if re.match(f"{pz_prefix}Score.*", pzfield): 6817 added_column = self.add_column( 6818 table_name=table_variants, 6819 column_name=pzfield, 6820 column_type="INTEGER", 6821 default_value="0", 6822 ) 6823 elif re.match(f"{pz_prefix}Flag.*", pzfield): 6824 added_column = self.add_column( 6825 table_name=table_variants, 6826 column_name=pzfield, 6827 column_type="BOOLEAN", 6828 default_value="1", 6829 ) 6830 else: 6831 added_column = self.add_column( 6832 table_name=table_variants, 6833 column_name=pzfield, 6834 column_type="STRING", 6835 default_value="''", 6836 ) 6837 added_columns.append(added_column) 6838 6839 # Profiles 6840 if profiles: 6841 6842 # foreach profile in configuration file 6843 for profile in prioritizations_config: 6844 6845 # If profile is asked in param, or ALL are asked (empty profile []) 6846 if profile in profiles or profiles == []: 6847 log.info(f"Profile '{profile}'") 6848 6849 sql_set_info_option = "" 6850 6851 sql_set_info = [] 6852 6853 # PZ fields set 6854 6855 # PZScore 6856 if ( 6857 f"{pz_prefix}Score{pzfields_sep}{profile}" 6858 in list_of_pzfields 6859 ): 6860 sql_set_info.append( 6861 f""" 6862 concat( 6863 '{pz_prefix}Score{pzfields_sep}{profile}=', 6864 {pz_prefix}Score{pzfields_sep}{profile} 6865 ) 6866 """ 6867 ) 6868 if ( 6869 profile == default_profile 6870 and f"{pz_prefix}Score" in list_of_pzfields 6871 ): 6872 sql_set_info.append( 6873 f""" 6874 concat( 6875 '{pz_prefix}Score=', 6876 {pz_prefix}Score{pzfields_sep}{profile} 6877 ) 6878 """ 6879 ) 6880 6881 # PZFlag 6882 if ( 6883 f"{pz_prefix}Flag{pzfields_sep}{profile}" 6884 in list_of_pzfields 6885 ): 6886 sql_set_info.append( 6887 f""" 6888 concat( 6889 '{pz_prefix}Flag{pzfields_sep}{profile}=', 6890 CASE 6891 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 6892 THEN 'PASS' 6893 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 
6894 THEN 'FILTERED' 6895 END 6896 ) 6897 """ 6898 ) 6899 if ( 6900 profile == default_profile 6901 and f"{pz_prefix}Flag" in list_of_pzfields 6902 ): 6903 sql_set_info.append( 6904 f""" 6905 concat( 6906 '{pz_prefix}Flag=', 6907 CASE 6908 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 6909 THEN 'PASS' 6910 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 6911 THEN 'FILTERED' 6912 END 6913 ) 6914 """ 6915 ) 6916 6917 # PZComment 6918 if ( 6919 f"{pz_prefix}Comment{pzfields_sep}{profile}" 6920 in list_of_pzfields 6921 ): 6922 sql_set_info.append( 6923 f""" 6924 CASE 6925 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 6926 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 6927 ELSE '' 6928 END 6929 """ 6930 ) 6931 if ( 6932 profile == default_profile 6933 and f"{pz_prefix}Comment" in list_of_pzfields 6934 ): 6935 sql_set_info.append( 6936 f""" 6937 CASE 6938 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 6939 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 6940 ELSE '' 6941 END 6942 """ 6943 ) 6944 6945 # PZInfos 6946 if ( 6947 f"{pz_prefix}Infos{pzfields_sep}{profile}" 6948 in list_of_pzfields 6949 ): 6950 sql_set_info.append( 6951 f""" 6952 CASE 6953 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 6954 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 6955 ELSE '' 6956 END 6957 """ 6958 ) 6959 if ( 6960 profile == default_profile 6961 and f"{pz_prefix}Infos" in list_of_pzfields 6962 ): 6963 sql_set_info.append( 6964 f""" 6965 CASE 6966 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 6967 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 6968 ELSE '' 6969 END 6970 """ 6971 ) 6972 6973 # Merge PZfields 6974 sql_set_info_option = "" 6975 sql_set_sep = "" 6976 for sql_set in sql_set_info: 6977 if sql_set_sep: 6978 sql_set_info_option += f""" 6979 , concat('{sql_set_sep}', 
{sql_set}) 6980 """ 6981 else: 6982 sql_set_info_option += f""" 6983 , {sql_set} 6984 """ 6985 sql_set_sep = ";" 6986 6987 sql_queries = [] 6988 for annotation in prioritizations_config[profile]: 6989 6990 # Explode specific annotation 6991 log.debug(f"Explode annotation '{annotation}'") 6992 added_columns += self.explode_infos( 6993 prefix=explode_infos_prefix, 6994 fields=[annotation], 6995 table=table_variants, 6996 ) 6997 extra_infos = self.get_extra_infos(table=table_variants) 6998 6999 # Check if annotation field is present 7000 if not f"{explode_infos_prefix}{annotation}" in extra_infos: 7001 log.debug(f"Annotation '{annotation}' not in data") 7002 continue 7003 else: 7004 log.debug(f"Annotation '{annotation}' in data") 7005 7006 # For each criterions 7007 for criterion in prioritizations_config[profile][ 7008 annotation 7009 ]: 7010 criterion_type = criterion["type"] 7011 criterion_value = criterion["value"] 7012 criterion_score = criterion.get("score", 0) 7013 criterion_flag = criterion.get("flag", "PASS") 7014 criterion_flag_bool = criterion_flag == "PASS" 7015 criterion_comment = ( 7016 ", ".join(criterion.get("comment", [])) 7017 .replace("'", "''") 7018 .replace(";", ",") 7019 .replace("\t", " ") 7020 ) 7021 criterion_infos = ( 7022 str(criterion) 7023 .replace("'", "''") 7024 .replace(";", ",") 7025 .replace("\t", " ") 7026 ) 7027 7028 sql_set = [] 7029 sql_set_info = [] 7030 7031 # PZ fields set 7032 if ( 7033 f"{pz_prefix}Score{pzfields_sep}{profile}" 7034 in list_of_pzfields 7035 ): 7036 if prioritization_score_mode == "HOWARD": 7037 sql_set.append( 7038 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7039 ) 7040 elif prioritization_score_mode == "VaRank": 7041 sql_set.append( 7042 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END" 7043 ) 7044 else: 7045 sql_set.append( 7046 
f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7047 ) 7048 if ( 7049 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7050 in list_of_pzfields 7051 ): 7052 sql_set.append( 7053 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7054 ) 7055 if ( 7056 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7057 in list_of_pzfields 7058 ): 7059 sql_set.append( 7060 f""" 7061 {pz_prefix}Comment{pzfields_sep}{profile} = 7062 concat( 7063 {pz_prefix}Comment{pzfields_sep}{profile}, 7064 CASE 7065 WHEN {pz_prefix}Comment{pzfields_sep}{profile}!='' 7066 THEN ', ' 7067 ELSE '' 7068 END, 7069 '{criterion_comment}' 7070 ) 7071 """ 7072 ) 7073 if ( 7074 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7075 in list_of_pzfields 7076 ): 7077 sql_set.append( 7078 f""" 7079 {pz_prefix}Infos{pzfields_sep}{profile} = 7080 concat( 7081 {pz_prefix}Infos{pzfields_sep}{profile}, 7082 '{criterion_infos}' 7083 ) 7084 """ 7085 ) 7086 sql_set_option = ",".join(sql_set) 7087 7088 # Criterion and comparison 7089 if sql_set_option: 7090 try: 7091 float(criterion_value) 7092 sql_update = f""" 7093 UPDATE {table_variants} 7094 SET {sql_set_option} 7095 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7096 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7097 """ 7098 except: 7099 contains_option = "" 7100 if criterion_type == "contains": 7101 contains_option = ".*" 7102 sql_update = f""" 7103 UPDATE {table_variants} 7104 SET {sql_set_option} 7105 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7106 """ 7107 sql_queries.append(sql_update) 7108 else: 7109 log.warning( 7110 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7111 ) 7112 7113 # PZTags 7114 if ( 7115 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7116 in list_of_pzfields 7117 ): 7118 7119 # Create PZFalgs 
value 7120 pztags_value = "" 7121 pztags_sep_default = "|" 7122 pztags_sep = "" 7123 for pzfield in pzfields: 7124 if pzfield not in [f"{pz_prefix}Tags"]: 7125 if ( 7126 f"{pzfield}{pzfields_sep}{profile}" 7127 in list_of_pzfields 7128 ): 7129 if pzfield in [f"{pz_prefix}Flag"]: 7130 pztags_value += f"""{pztags_sep}{pzfield}#', 7131 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7132 THEN 'PASS' 7133 ELSE 'FILTERED' 7134 END, '""" 7135 else: 7136 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7137 pztags_sep = pztags_sep_default 7138 7139 # Add Query update for PZFlags 7140 sql_update_pztags = f""" 7141 UPDATE {table_variants} 7142 SET INFO = concat( 7143 INFO, 7144 CASE WHEN INFO NOT in ('','.') 7145 THEN ';' 7146 ELSE '' 7147 END, 7148 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7149 ) 7150 """ 7151 sql_queries.append(sql_update_pztags) 7152 7153 # Add Query update for PZFlags for default 7154 if profile == default_profile: 7155 sql_update_pztags_default = f""" 7156 UPDATE {table_variants} 7157 SET INFO = concat( 7158 INFO, 7159 ';', 7160 '{pz_prefix}Tags={pztags_value}' 7161 ) 7162 """ 7163 sql_queries.append(sql_update_pztags_default) 7164 7165 log.info(f"""Profile '{profile}' - Prioritization... """) 7166 7167 if sql_queries: 7168 7169 for sql_query in sql_queries: 7170 log.debug( 7171 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7172 ) 7173 self.conn.execute(sql_query) 7174 7175 log.info(f"""Profile '{profile}' - Update... 
""") 7176 sql_query_update = f""" 7177 UPDATE {table_variants} 7178 SET INFO = 7179 concat( 7180 CASE 7181 WHEN INFO NOT IN ('','.') 7182 THEN concat(INFO, ';') 7183 ELSE '' 7184 END 7185 {sql_set_info_option} 7186 ) 7187 """ 7188 self.conn.execute(sql_query_update) 7189 7190 else: 7191 7192 log.warning(f"No profiles in parameters") 7193 7194 # Remove added columns 7195 for added_column in added_columns: 7196 self.drop_column(column=added_column) 7197 7198 # Explode INFOS fields into table fields 7199 if self.get_explode_infos(): 7200 self.explode_infos( 7201 prefix=self.get_explode_infos_prefix(), 7202 fields=self.get_explode_infos_fields(), 7203 force=True, 7204 ) 7205 7206 return True 7207 7208 ### 7209 # HGVS 7210 ### 7211 7212 def annotation_hgvs(self, threads: int = None) -> None: 7213 """ 7214 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7215 coordinates and alleles. 7216 7217 :param threads: The `threads` parameter is an optional integer that specifies the number of 7218 threads to use for parallel processing. If no value is provided, it will default to the number 7219 of threads obtained from the `get_threads()` method 7220 :type threads: int 7221 """ 7222 7223 # Function for each partition of the Dask Dataframe 7224 def partition_function(partition): 7225 """ 7226 The function `partition_function` applies the `annotation_hgvs_partition` function to 7227 each row of a DataFrame called `partition`. 7228 7229 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7230 to be processed 7231 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7232 the "partition" dataframe along the axis 1. 
7233 """ 7234 return partition.apply(annotation_hgvs_partition, axis=1) 7235 7236 def annotation_hgvs_partition(row) -> str: 7237 """ 7238 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7239 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7240 7241 :param row: A dictionary-like object that contains the values for the following keys: 7242 :return: a string that contains the HGVS names associated with the given row of data. 7243 """ 7244 7245 chr = row["CHROM"] 7246 pos = row["POS"] 7247 ref = row["REF"] 7248 alt = row["ALT"] 7249 7250 # Find list of associated transcripts 7251 transcripts_list = list( 7252 polars_conn.execute( 7253 f""" 7254 SELECT transcript 7255 FROM refseq_df 7256 WHERE CHROM='{chr}' 7257 AND POS={pos} 7258 """ 7259 )["transcript"] 7260 ) 7261 7262 # Full HGVS annotation in list 7263 hgvs_full_list = [] 7264 7265 for transcript_name in transcripts_list: 7266 7267 # Transcript 7268 transcript = get_transcript( 7269 transcripts=transcripts, transcript_name=transcript_name 7270 ) 7271 # Exon 7272 if use_exon: 7273 exon = transcript.find_exon_number(pos) 7274 else: 7275 exon = None 7276 # Protein 7277 transcript_protein = None 7278 if use_protein or add_protein or full_format: 7279 transcripts_protein = list( 7280 polars_conn.execute( 7281 f""" 7282 SELECT protein 7283 FROM refseqlink_df 7284 WHERE transcript='{transcript_name}' 7285 LIMIT 1 7286 """ 7287 )["protein"] 7288 ) 7289 if len(transcripts_protein): 7290 transcript_protein = transcripts_protein[0] 7291 7292 # HGVS name 7293 hgvs_name = format_hgvs_name( 7294 chr, 7295 pos, 7296 ref, 7297 alt, 7298 genome=genome, 7299 transcript=transcript, 7300 transcript_protein=transcript_protein, 7301 exon=exon, 7302 use_gene=use_gene, 7303 use_protein=use_protein, 7304 full_format=full_format, 7305 use_version=use_version, 7306 codon_type=codon_type, 7307 ) 7308 hgvs_full_list.append(hgvs_name) 7309 if add_protein and not 
use_protein and not full_format: 7310 hgvs_name = format_hgvs_name( 7311 chr, 7312 pos, 7313 ref, 7314 alt, 7315 genome=genome, 7316 transcript=transcript, 7317 transcript_protein=transcript_protein, 7318 exon=exon, 7319 use_gene=use_gene, 7320 use_protein=True, 7321 full_format=False, 7322 use_version=use_version, 7323 codon_type=codon_type, 7324 ) 7325 hgvs_full_list.append(hgvs_name) 7326 7327 # Create liste of HGVS annotations 7328 hgvs_full = ",".join(hgvs_full_list) 7329 7330 return hgvs_full 7331 7332 # Polars connexion 7333 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7334 7335 # Config 7336 config = self.get_config() 7337 7338 # Databases 7339 # Genome 7340 databases_genomes_folders = ( 7341 config.get("folders", {}) 7342 .get("databases", {}) 7343 .get("genomes", DEFAULT_GENOME_FOLDER) 7344 ) 7345 databases_genome = ( 7346 config.get("folders", {}).get("databases", {}).get("genomes", "") 7347 ) 7348 # refseq database folder 7349 databases_refseq_folders = ( 7350 config.get("folders", {}) 7351 .get("databases", {}) 7352 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7353 ) 7354 # refseq 7355 databases_refseq = config.get("databases", {}).get("refSeq", None) 7356 # refSeqLink 7357 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7358 7359 # Param 7360 param = self.get_param() 7361 7362 # Quick HGVS 7363 if "hgvs_options" in param and param.get("hgvs_options", ""): 7364 log.info(f"Quick HGVS Annotation:") 7365 if not param.get("hgvs", None): 7366 param["hgvs"] = {} 7367 for option in param.get("hgvs_options", "").split(","): 7368 option_var_val = option.split("=") 7369 option_var = option_var_val[0] 7370 if len(option_var_val) > 1: 7371 option_val = option_var_val[1] 7372 else: 7373 option_val = "True" 7374 if option_val.upper() in ["TRUE"]: 7375 option_val = True 7376 elif option_val.upper() in ["FALSE"]: 7377 option_val = False 7378 log.info(f" {option_var}={option_val}") 7379 param["hgvs"][option_var] = option_val 7380 
7381 # Check if HGVS annotation enabled 7382 if "hgvs" in param: 7383 log.info(f"HGVS Annotation... ") 7384 for hgvs_option in param.get("hgvs", {}): 7385 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7386 else: 7387 return 7388 7389 # HGVS Param 7390 param_hgvs = param.get("hgvs", {}) 7391 use_exon = param_hgvs.get("use_exon", False) 7392 use_gene = param_hgvs.get("use_gene", False) 7393 use_protein = param_hgvs.get("use_protein", False) 7394 add_protein = param_hgvs.get("add_protein", False) 7395 full_format = param_hgvs.get("full_format", False) 7396 use_version = param_hgvs.get("use_version", False) 7397 codon_type = param_hgvs.get("codon_type", "3") 7398 7399 # refSseq refSeqLink 7400 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7401 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7402 7403 # Assembly 7404 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7405 7406 # Genome 7407 genome_file = None 7408 if find_genome(databases_genome): 7409 genome_file = find_genome(databases_genome) 7410 else: 7411 genome_file = find_genome( 7412 genome_path=databases_genomes_folders, assembly=assembly 7413 ) 7414 log.debug("Genome: " + str(genome_file)) 7415 7416 # refSseq 7417 refseq_file = find_file_prefix( 7418 input_file=databases_refseq, 7419 prefix="ncbiRefSeq", 7420 folder=databases_refseq_folders, 7421 assembly=assembly, 7422 ) 7423 log.debug("refSeq: " + str(refseq_file)) 7424 7425 # refSeqLink 7426 refseqlink_file = find_file_prefix( 7427 input_file=databases_refseqlink, 7428 prefix="ncbiRefSeqLink", 7429 folder=databases_refseq_folders, 7430 assembly=assembly, 7431 ) 7432 log.debug("refSeqLink: " + str(refseqlink_file)) 7433 7434 # Threads 7435 if not threads: 7436 threads = self.get_threads() 7437 log.debug("Threads: " + str(threads)) 7438 7439 # Variables 7440 table_variants = self.get_table_variants(clause="update") 7441 7442 # Get variants SNV and InDel only 7443 
query_variants = f""" 7444 SELECT "#CHROM" AS CHROM, POS, REF, ALT 7445 FROM {table_variants} 7446 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 7447 """ 7448 df_variants = self.get_query_to_df(query_variants) 7449 7450 # Added columns 7451 added_columns = [] 7452 7453 # Add hgvs column in variants table 7454 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 7455 added_column = self.add_column( 7456 table_variants, hgvs_column_name, "STRING", default_value=None 7457 ) 7458 added_columns.append(added_column) 7459 7460 log.debug(f"refSeq loading...") 7461 # refSeq in duckDB 7462 refseq_table = get_refseq_table( 7463 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 7464 ) 7465 # Loading all refSeq in Dataframe 7466 refseq_query = f""" 7467 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 7468 FROM {refseq_table} 7469 JOIN df_variants ON ( 7470 {refseq_table}.chrom = df_variants.CHROM 7471 AND {refseq_table}.txStart<=df_variants.POS 7472 AND {refseq_table}.txEnd>=df_variants.POS 7473 ) 7474 """ 7475 refseq_df = self.conn.query(refseq_query).pl() 7476 7477 if refseqlink_file: 7478 log.debug(f"refSeqLink loading...") 7479 # refSeqLink in duckDB 7480 refseqlink_table = get_refseq_table( 7481 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 7482 ) 7483 # Loading all refSeqLink in Dataframe 7484 protacc_column = "protAcc_with_ver" 7485 mrnaacc_column = "mrnaAcc_with_ver" 7486 refseqlink_query = f""" 7487 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 7488 FROM {refseqlink_table} 7489 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 7490 WHERE protAcc_without_ver IS NOT NULL 7491 """ 7492 # Polars Dataframe 7493 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 7494 7495 # Read RefSeq transcripts into a python dict/model. 
7496 log.debug(f"Transcripts loading...") 7497 with tempfile.TemporaryDirectory() as tmpdir: 7498 transcripts_query = f""" 7499 COPY ( 7500 SELECT {refseq_table}.* 7501 FROM {refseq_table} 7502 JOIN df_variants ON ( 7503 {refseq_table}.chrom=df_variants.CHROM 7504 AND {refseq_table}.txStart<=df_variants.POS 7505 AND {refseq_table}.txEnd>=df_variants.POS 7506 ) 7507 ) 7508 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 7509 """ 7510 self.conn.query(transcripts_query) 7511 with open(f"{tmpdir}/transcript.tsv") as infile: 7512 transcripts = read_transcripts(infile) 7513 7514 # Polars connexion 7515 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7516 7517 log.debug("Genome loading...") 7518 # Read genome sequence using pyfaidx. 7519 genome = Fasta(genome_file) 7520 7521 log.debug("Start annotation HGVS...") 7522 7523 # Create 7524 # a Dask Dataframe from Pandas dataframe with partition as number of threads 7525 ddf = dd.from_pandas(df_variants, npartitions=threads) 7526 7527 # Use dask.dataframe.apply() to apply function on each partition 7528 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 7529 7530 # Convert Dask DataFrame to Pandas Dataframe 7531 df = ddf.compute() 7532 7533 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
7534 with tempfile.TemporaryDirectory() as tmpdir: 7535 df_parquet = os.path.join(tmpdir, "df.parquet") 7536 df.to_parquet(df_parquet) 7537 7538 # Update hgvs column 7539 update_variant_query = f""" 7540 UPDATE {table_variants} 7541 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 7542 FROM read_parquet('{df_parquet}') as df 7543 WHERE variants."#CHROM" = df.CHROM 7544 AND variants.POS = df.POS 7545 AND variants.REF = df.REF 7546 AND variants.ALT = df.ALT 7547 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 7548 """ 7549 self.execute_query(update_variant_query) 7550 7551 # Update INFO column 7552 sql_query_update = f""" 7553 UPDATE {table_variants} 7554 SET INFO = 7555 concat( 7556 CASE 7557 WHEN INFO NOT IN ('','.') 7558 THEN concat(INFO, ';') 7559 ELSE '' 7560 END, 7561 'hgvs=', 7562 {hgvs_column_name} 7563 ) 7564 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 7565 """ 7566 self.execute_query(sql_query_update) 7567 7568 # Add header 7569 HGVS_INFOS = { 7570 "hgvs": { 7571 "ID": "hgvs", 7572 "Number": ".", 7573 "Type": "String", 7574 "Description": f"HGVS annotatation with HOWARD", 7575 } 7576 } 7577 7578 for field in HGVS_INFOS: 7579 field_ID = HGVS_INFOS[field]["ID"] 7580 field_description = HGVS_INFOS[field]["Description"] 7581 self.get_header().infos[field_ID] = vcf.parser._Info( 7582 field_ID, 7583 HGVS_INFOS[field]["Number"], 7584 HGVS_INFOS[field]["Type"], 7585 field_description, 7586 "unknown", 7587 "unknown", 7588 code_type_map[HGVS_INFOS[field]["Type"]], 7589 ) 7590 7591 # Remove added columns 7592 for added_column in added_columns: 7593 self.drop_column(column=added_column) 7594 7595 ### 7596 # Calculation 7597 ### 7598 7599 def get_operations_help( 7600 self, operations_config_dict: dict = {}, operations_config_file: str = None 7601 ) -> list: 7602 7603 # Init 7604 operations_help = [] 7605 7606 # operations 7607 operations = self.get_config_json( 7608 name="calculations", 7609 
config_dict=operations_config_dict, 7610 config_file=operations_config_file, 7611 ) 7612 for op in operations: 7613 op_name = operations[op].get("name", op).upper() 7614 op_description = operations[op].get("description", op_name) 7615 op_available = operations[op].get("available", False) 7616 if op_available: 7617 operations_help.append(f" {op_name}: {op_description}") 7618 7619 # Sort operations 7620 operations_help.sort() 7621 7622 # insert header 7623 operations_help.insert(0, "Available calculation operations:") 7624 7625 # Return 7626 return operations_help 7627 7628 def calculation( 7629 self, 7630 operations: dict = {}, 7631 operations_config_dict: dict = {}, 7632 operations_config_file: str = None, 7633 ) -> None: 7634 """ 7635 It takes a list of operations, and for each operation, it checks if it's a python or sql 7636 operation, and then calls the appropriate function 7637 7638 param json example: 7639 "calculation": { 7640 "NOMEN": { 7641 "options": { 7642 "hgvs_field": "hgvs" 7643 }, 7644 "middle" : null 7645 } 7646 """ 7647 7648 # Param 7649 param = self.get_param() 7650 7651 # operations config 7652 operations_config = self.get_config_json( 7653 name="calculations", 7654 config_dict=operations_config_dict, 7655 config_file=operations_config_file, 7656 ) 7657 7658 # Upper keys 7659 operations_config = {k.upper(): v for k, v in operations_config.items()} 7660 7661 # Calculations 7662 7663 # Operations from param 7664 operations = param.get("calculation", {}).get("calculations", operations) 7665 7666 # Quick calculation - add 7667 if param.get("calculations", None): 7668 calculations_list = [ 7669 value for value in param.get("calculations", "").split(",") 7670 ] 7671 log.info(f"Quick Calculations:") 7672 for calculation_key in calculations_list: 7673 log.info(f" {calculation_key}") 7674 for calculation_operation in calculations_list: 7675 if calculation_operation.upper() not in operations: 7676 operations[calculation_operation.upper()] = {} 7677 
add_value_into_dict( 7678 dict_tree=param, 7679 sections=[ 7680 "calculation", 7681 "calculations", 7682 calculation_operation.upper(), 7683 ], 7684 value={}, 7685 ) 7686 7687 # Operations for calculation 7688 if not operations: 7689 operations = param.get("calculation", {}).get("calculations", {}) 7690 7691 if operations: 7692 log.info(f"Calculations...") 7693 7694 # For each operations 7695 for operation_name in operations: 7696 operation_name = operation_name.upper() 7697 if operation_name not in [""]: 7698 if operation_name in operations_config: 7699 log.info(f"Calculation '{operation_name}'") 7700 operation = operations_config[operation_name] 7701 operation_type = operation.get("type", "sql") 7702 if operation_type == "python": 7703 self.calculation_process_function( 7704 operation=operation, operation_name=operation_name 7705 ) 7706 elif operation_type == "sql": 7707 self.calculation_process_sql( 7708 operation=operation, operation_name=operation_name 7709 ) 7710 else: 7711 log.error( 7712 f"Operations config: Type '{operation_type}' NOT available" 7713 ) 7714 raise ValueError( 7715 f"Operations config: Type '{operation_type}' NOT available" 7716 ) 7717 else: 7718 log.error( 7719 f"Operations config: Calculation '{operation_name}' NOT available" 7720 ) 7721 raise ValueError( 7722 f"Operations config: Calculation '{operation_name}' NOT available" 7723 ) 7724 7725 # Explode INFOS fields into table fields 7726 if self.get_explode_infos(): 7727 self.explode_infos( 7728 prefix=self.get_explode_infos_prefix(), 7729 fields=self.get_explode_infos_fields(), 7730 force=True, 7731 ) 7732 7733 def calculation_process_sql( 7734 self, operation: dict, operation_name: str = "unknown" 7735 ) -> None: 7736 """ 7737 The `calculation_process_sql` function takes in a mathematical operation as a string and 7738 performs the operation, updating the specified table with the result. 
7739 7740 :param operation: The `operation` parameter is a dictionary that contains information about the 7741 mathematical operation to be performed. It includes the following keys: 7742 :type operation: dict 7743 :param operation_name: The `operation_name` parameter is a string that represents the name of 7744 the mathematical operation being performed. It is used for logging and error handling purposes, 7745 defaults to unknown 7746 :type operation_name: str (optional) 7747 """ 7748 7749 # table variants 7750 table_variants = self.get_table_variants(clause="alter") 7751 7752 # Operation infos 7753 operation_name = operation.get("name", "unknown") 7754 log.debug(f"process sql {operation_name}") 7755 output_column_name = operation.get("output_column_name", operation_name) 7756 output_column_type = operation.get("output_column_type", "String") 7757 prefix = operation.get("explode_infos_prefix", "") 7758 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 7759 output_column_description = operation.get( 7760 "output_column_description", f"{operation_name} operation" 7761 ) 7762 operation_query = operation.get("operation_query", None) 7763 if isinstance(operation_query, list): 7764 operation_query = " ".join(operation_query) 7765 operation_info_fields = operation.get("info_fields", []) 7766 operation_info_fields_check = operation.get("info_fields_check", False) 7767 operation_info = operation.get("operation_info", True) 7768 7769 if operation_query: 7770 7771 # Info fields check 7772 operation_info_fields_check_result = True 7773 if operation_info_fields_check: 7774 header_infos = self.get_header().infos 7775 for info_field in operation_info_fields: 7776 operation_info_fields_check_result = ( 7777 operation_info_fields_check_result 7778 and info_field in header_infos 7779 ) 7780 7781 # If info fields available 7782 if operation_info_fields_check_result: 7783 7784 # Added_columns 7785 added_columns = [] 7786 7787 # Create VCF header field 
7788 vcf_reader = self.get_header() 7789 vcf_reader.infos[output_column_name] = vcf.parser._Info( 7790 output_column_name, 7791 ".", 7792 output_column_type, 7793 output_column_description, 7794 "howard calculation", 7795 "0", 7796 self.code_type_map.get(output_column_type), 7797 ) 7798 7799 # Explode infos if needed 7800 log.debug(f"calculation_process_sql prefix {prefix}") 7801 added_columns += self.explode_infos( 7802 prefix=prefix, 7803 fields=[output_column_name] + operation_info_fields, 7804 force=True, 7805 ) 7806 7807 # Create column 7808 added_column = self.add_column( 7809 table_name=table_variants, 7810 column_name=prefix + output_column_name, 7811 column_type=output_column_type_sql, 7812 default_value="null", 7813 ) 7814 added_columns.append(added_column) 7815 7816 # Operation calculation 7817 try: 7818 7819 # Query to update calculation column 7820 sql_update = f""" 7821 UPDATE {table_variants} 7822 SET "{prefix}{output_column_name}" = ({operation_query}) 7823 """ 7824 self.conn.execute(sql_update) 7825 7826 # Add to INFO 7827 if operation_info: 7828 sql_update_info = f""" 7829 UPDATE {table_variants} 7830 SET "INFO" = 7831 concat( 7832 CASE 7833 WHEN "INFO" IS NOT NULL 7834 THEN concat("INFO", ';') 7835 ELSE '' 7836 END, 7837 '{output_column_name}=', 7838 "{prefix}{output_column_name}" 7839 ) 7840 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 7841 """ 7842 self.conn.execute(sql_update_info) 7843 7844 except: 7845 log.error( 7846 f"Operations config: Calculation '{operation_name}' query failed" 7847 ) 7848 raise ValueError( 7849 f"Operations config: Calculation '{operation_name}' query failed" 7850 ) 7851 7852 # Remove added columns 7853 for added_column in added_columns: 7854 log.debug(f"added_column: {added_column}") 7855 self.drop_column(column=added_column) 7856 7857 else: 7858 log.error( 7859 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 7860 ) 7861 raise ValueError( 7862 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7863 ) 7864 7865 else: 7866 log.error( 7867 f"Operations config: Calculation '{operation_name}' query NOT defined" 7868 ) 7869 raise ValueError( 7870 f"Operations config: Calculation '{operation_name}' query NOT defined" 7871 ) 7872 7873 def calculation_process_function( 7874 self, operation: dict, operation_name: str = "unknown" 7875 ) -> None: 7876 """ 7877 The `calculation_process_function` takes in an operation dictionary and performs the specified 7878 function with the given parameters. 7879 7880 :param operation: The `operation` parameter is a dictionary that contains information about the 7881 operation to be performed. It has the following keys: 7882 :type operation: dict 7883 :param operation_name: The `operation_name` parameter is a string that represents the name of 7884 the operation being performed. It is used for logging purposes, defaults to unknown 7885 :type operation_name: str (optional) 7886 """ 7887 7888 operation_name = operation["name"] 7889 log.debug(f"process sql {operation_name}") 7890 function_name = operation["function_name"] 7891 function_params = operation["function_params"] 7892 getattr(self, function_name)(*function_params) 7893 7894 def calculation_variant_id(self) -> None: 7895 """ 7896 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 7897 updates the INFO field of a variants table with the variant ID. 
7898 """ 7899 7900 # variant_id annotation field 7901 variant_id_tag = self.get_variant_id_column() 7902 added_columns = [variant_id_tag] 7903 7904 # variant_id hgvs tags" 7905 vcf_infos_tags = { 7906 variant_id_tag: "howard variant ID annotation", 7907 } 7908 7909 # Variants table 7910 table_variants = self.get_table_variants() 7911 7912 # Header 7913 vcf_reader = self.get_header() 7914 7915 # Add variant_id to header 7916 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 7917 variant_id_tag, 7918 ".", 7919 "String", 7920 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 7921 "howard calculation", 7922 "0", 7923 self.code_type_map.get("String"), 7924 ) 7925 7926 # Update 7927 sql_update = f""" 7928 UPDATE {table_variants} 7929 SET "INFO" = 7930 concat( 7931 CASE 7932 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 7933 THEN '' 7934 ELSE concat("INFO", ';') 7935 END, 7936 '{variant_id_tag}=', 7937 "{variant_id_tag}" 7938 ) 7939 """ 7940 self.conn.execute(sql_update) 7941 7942 # Remove added columns 7943 for added_column in added_columns: 7944 self.drop_column(column=added_column) 7945 7946 def calculation_extract_snpeff_hgvs( 7947 self, 7948 snpeff_hgvs: str = "snpeff_hgvs", 7949 snpeff_field: str = "ANN", 7950 ) -> None: 7951 """ 7952 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 7953 annotation field in a VCF file and adds them as a new column in the variants table. 7954 7955 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 7956 function is used to specify the name of the column that will store the HGVS nomenclatures 7957 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 7958 snpeff_hgvs 7959 :type snpeff_hgvs: str (optional) 7960 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 7961 function represents the field in the VCF file that contains SnpEff annotations. 
This field is 7962 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 7963 to ANN 7964 :type snpeff_field: str (optional) 7965 """ 7966 7967 # Snpeff hgvs tags 7968 vcf_infos_tags = { 7969 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 7970 } 7971 7972 # Prefix 7973 prefix = self.get_explode_infos_prefix() 7974 if prefix: 7975 prefix = "INFO/" 7976 7977 # snpEff fields 7978 speff_ann_infos = prefix + snpeff_field 7979 speff_hgvs_infos = prefix + snpeff_hgvs 7980 7981 # Variants table 7982 table_variants = self.get_table_variants() 7983 7984 # Header 7985 vcf_reader = self.get_header() 7986 7987 # Add columns 7988 added_columns = [] 7989 7990 # Explode HGVS field in column 7991 added_columns += self.explode_infos(fields=[snpeff_field]) 7992 7993 if snpeff_field in vcf_reader.infos: 7994 7995 log.debug(vcf_reader.infos[snpeff_field]) 7996 7997 # Extract ANN header 7998 ann_description = vcf_reader.infos[snpeff_field].desc 7999 pattern = r"'(.+?)'" 8000 match = re.search(pattern, ann_description) 8001 if match: 8002 ann_header_match = match.group(1).split(" | ") 8003 ann_header_desc = {} 8004 for i in range(len(ann_header_match)): 8005 ann_header_info = "".join( 8006 char for char in ann_header_match[i] if char.isalnum() 8007 ) 8008 ann_header_desc[ann_header_info] = ann_header_match[i] 8009 if not ann_header_desc: 8010 raise ValueError("Invalid header description format") 8011 else: 8012 raise ValueError("Invalid header description format") 8013 8014 # Create variant id 8015 variant_id_column = self.get_variant_id_column() 8016 added_columns += [variant_id_column] 8017 8018 # Create dataframe 8019 dataframe_snpeff_hgvs = self.get_query_to_df( 8020 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8021 ) 8022 8023 # Create main NOMEN column 8024 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8025 speff_ann_infos 8026 ].apply( 8027 lambda x: extract_snpeff_hgvs( 
8028 str(x), header=list(ann_header_desc.values()) 8029 ) 8030 ) 8031 8032 # Add snpeff_hgvs to header 8033 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 8034 snpeff_hgvs, 8035 ".", 8036 "String", 8037 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 8038 "howard calculation", 8039 "0", 8040 self.code_type_map.get("String"), 8041 ) 8042 8043 # Update 8044 sql_update = f""" 8045 UPDATE variants 8046 SET "INFO" = 8047 concat( 8048 CASE 8049 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8050 THEN '' 8051 ELSE concat("INFO", ';') 8052 END, 8053 CASE 8054 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8055 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8056 THEN concat( 8057 '{snpeff_hgvs}=', 8058 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8059 ) 8060 ELSE '' 8061 END 8062 ) 8063 FROM dataframe_snpeff_hgvs 8064 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8065 8066 """ 8067 self.conn.execute(sql_update) 8068 8069 # Delete dataframe 8070 del dataframe_snpeff_hgvs 8071 gc.collect() 8072 8073 else: 8074 8075 log.warning( 8076 "No snpEff annotation. Please Anotate with snpEff before use this calculation option" 8077 ) 8078 8079 # Remove added columns 8080 for added_column in added_columns: 8081 self.drop_column(column=added_column) 8082 8083 def calculation_snpeff_ann_explode( 8084 self, 8085 uniquify: bool = True, 8086 output_format: str = "fields", 8087 output_prefix: str = "snpeff_", 8088 snpeff_field: str = "ANN", 8089 ) -> None: 8090 """ 8091 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 8092 exploding the HGVS field and updating variant information accordingly. 8093 8094 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 8095 boolean flag that determines whether the output should be uniquified or not. 
When set to `True`, 8096 it indicates that the output should be unique, meaning that duplicate entries should be removed, 8097 defaults to True 8098 :type uniquify: bool (optional) 8099 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 8100 function specifies the format in which the output annotations will be generated. It has a 8101 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 8102 format, defaults to fields 8103 :type output_format: str (optional) 8104 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8105 method is used to specify the prefix that will be added to the output annotations generated 8106 during the calculation process. This prefix helps to differentiate the newly added annotations 8107 from existing ones in the output data. By default, the, defaults to ANN_ 8108 :type output_prefix: str (optional) 8109 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8110 function is used to specify the field in the VCF file that contains SnpEff annotations. 
This 8111 field will be processed to explode the HGVS annotations and update the variant information 8112 accordingly, defaults to ANN 8113 :type snpeff_field: str (optional) 8114 """ 8115 8116 # SnpEff annotation field 8117 snpeff_hgvs = "snpeff_ann_explode" 8118 8119 # Snpeff hgvs tags 8120 vcf_infos_tags = { 8121 snpeff_hgvs: "Explode snpEff annotations", 8122 } 8123 8124 # Prefix 8125 prefix = self.get_explode_infos_prefix() 8126 if prefix: 8127 prefix = "INFO/" 8128 8129 # snpEff fields 8130 speff_ann_infos = prefix + snpeff_field 8131 speff_hgvs_infos = prefix + snpeff_hgvs 8132 8133 # Variants table 8134 table_variants = self.get_table_variants() 8135 8136 # Header 8137 vcf_reader = self.get_header() 8138 8139 # Add columns 8140 added_columns = [] 8141 8142 # Explode HGVS field in column 8143 added_columns += self.explode_infos(fields=[snpeff_field]) 8144 log.debug(f"snpeff_field={snpeff_field}") 8145 log.debug(f"added_columns={added_columns}") 8146 8147 if snpeff_field in vcf_reader.infos: 8148 8149 # Extract ANN header 8150 ann_description = vcf_reader.infos[snpeff_field].desc 8151 pattern = r"'(.+?)'" 8152 match = re.search(pattern, ann_description) 8153 if match: 8154 ann_header_match = match.group(1).split(" | ") 8155 ann_header = [] 8156 ann_header_desc = {} 8157 for i in range(len(ann_header_match)): 8158 ann_header_info = "".join( 8159 char for char in ann_header_match[i] if char.isalnum() 8160 ) 8161 ann_header.append(ann_header_info) 8162 ann_header_desc[ann_header_info] = ann_header_match[i] 8163 if not ann_header_desc: 8164 raise ValueError("Invalid header description format") 8165 else: 8166 raise ValueError("Invalid header description format") 8167 8168 # Create variant id 8169 variant_id_column = self.get_variant_id_column() 8170 added_columns += [variant_id_column] 8171 8172 # Create dataframe 8173 dataframe_snpeff_hgvs = self.get_query_to_df( 8174 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8175 ) 8176 
8177 # Create snpEff columns 8178 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8179 speff_ann_infos 8180 ].apply( 8181 lambda x: explode_snpeff_ann( 8182 str(x), 8183 uniquify=uniquify, 8184 output_format=output_format, 8185 prefix=output_prefix, 8186 header=list(ann_header_desc.values()), 8187 ) 8188 ) 8189 8190 # Header 8191 ann_annotations_prefix = "" 8192 if output_format.upper() in ["JSON"]: 8193 ann_annotations_prefix = f"{output_prefix}=" 8194 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8195 output_prefix, 8196 ".", 8197 "String", 8198 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8199 + " - JSON format", 8200 "howard calculation", 8201 "0", 8202 self.code_type_map.get("String"), 8203 ) 8204 else: 8205 for ann_annotation in ann_header: 8206 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8207 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8208 ann_annotation_id, 8209 ".", 8210 "String", 8211 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8212 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8213 "howard calculation", 8214 "0", 8215 self.code_type_map.get("String"), 8216 ) 8217 8218 # Update 8219 sql_update = f""" 8220 UPDATE variants 8221 SET "INFO" = 8222 concat( 8223 CASE 8224 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8225 THEN '' 8226 ELSE concat("INFO", ';') 8227 END, 8228 CASE 8229 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8230 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8231 THEN concat( 8232 '{ann_annotations_prefix}', 8233 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8234 ) 8235 ELSE '' 8236 END 8237 ) 8238 FROM dataframe_snpeff_hgvs 8239 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8240 8241 """ 8242 self.conn.execute(sql_update) 8243 8244 # Delete dataframe 8245 del dataframe_snpeff_hgvs 8246 gc.collect() 8247 8248 else: 8249 8250 log.warning( 8251 "No snpEff annotation. 
    def calculation_extract_nomen(self) -> None:
        """
        Extract HGVS NOMEN nomenclatures (NOMEN, CNOMEN, PNOMEN, ...) from the
        configured HGVS INFO field and append them as INFO tags.
        """

        # Name of the intermediate column holding the per-variant NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN tags and their VCF header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Parameters object
        param = self.get_param()

        # Prefix of exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # VCF header object (mutated below to declare the NOMEN tags)
        vcf_reader = self.get_header()

        # INFO field that carries the HGVS annotations (default "hgvs")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Optional file listing preferred transcripts (first column is used)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added during this calculation; dropped in the cleanup at the end
        added_columns = []

        # Explode the HGVS INFO field into its own column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Exploded column name for the HGVS field
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Fetch variant keys + HGVS column into a pandas dataframe
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Compute the NOMEN dict for each variant (preferred transcripts first)
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode the NOMEN dict into one column per tag and build the SQL
            # concat fragments for the INFO update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column (apply runs immediately, so
                # the loop variable is bound at the right time)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Declare the tag in the VCF header
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # Fragments become additional arguments of the concat() below
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append the NOMEN tags to INFO, joining on the variant key columns.
            # NOTE(review): unlike sibling calculations, INFO values '' / '.' are
            # kept as-is here, and each fragment starts with ';' — an empty INFO
            # would yield a leading ';'; confirm whether that is intended.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the (potentially large) dataframe promptly
            del dataframe_hgvs
            gc.collect()

        # Drop the working columns created above
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        Count in how many pipelines/samples each variant is found and append the
        result to INFO as '<tag>=<value>'.

        Does nothing when the VCF has no FORMAT column or no samples.

        :param tag: INFO tag name used for the annotation, defaults to
            "findbypipeline"
        :type tag: str (optional)
        """

        # Only meaningful when genotypes are present
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # Description for the generated VCF header entry
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Working column name
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # VCF header object (mutated below)
            vcf_reader = self.get_header()

            # Temporary unique variant id column used as the UPDATE join key
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant id, FORMAT and every sample
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the findbypipeline value per variant row
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Declare the tag in the VCF header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append the annotation to INFO, joining on the variant id.
            # 'dataframe_findbypipeline' is resolved by DuckDB from the local
            # pandas dataframe of the same name.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the working columns created above
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the (potentially large) dataframe promptly
            del dataframe_findbypipeline
            gc.collect()
concat("INFO", ';') 8483 END, 8484 CASE 8485 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 8486 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 8487 THEN concat( 8488 '{findbypipeline_tag}=', 8489 dataframe_findbypipeline."{findbypipeline_infos}" 8490 ) 8491 ELSE '' 8492 END 8493 ) 8494 FROM dataframe_findbypipeline 8495 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 8496 """ 8497 self.conn.execute(sql_update) 8498 8499 # Remove added columns 8500 for added_column in added_columns: 8501 self.drop_column(column=added_column) 8502 8503 # Delete dataframe 8504 del dataframe_findbypipeline 8505 gc.collect() 8506 8507 def calculation_genotype_concordance(self) -> None: 8508 """ 8509 The function `calculation_genotype_concordance` calculates the genotype concordance for 8510 multi-caller VCF files and updates the variant information in the database. 8511 """ 8512 8513 # if FORMAT and samples 8514 if ( 8515 "FORMAT" in self.get_header_columns_as_list() 8516 and self.get_header_sample_list() 8517 ): 8518 8519 # genotypeconcordance annotation field 8520 genotypeconcordance_tag = "genotypeconcordance" 8521 8522 # VCF infos tags 8523 vcf_infos_tags = { 8524 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 8525 } 8526 8527 # Prefix 8528 prefix = self.get_explode_infos_prefix() 8529 8530 # Field 8531 genotypeconcordance_infos = prefix + genotypeconcordance_tag 8532 8533 # Variants table 8534 table_variants = self.get_table_variants() 8535 8536 # Header 8537 vcf_reader = self.get_header() 8538 8539 # Create variant id 8540 variant_id_column = self.get_variant_id_column() 8541 added_columns = [variant_id_column] 8542 8543 # variant_id, FORMAT and samples 8544 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8545 self.get_header_sample_list() 8546 ) 8547 8548 # Create dataframe 8549 dataframe_genotypeconcordance = self.get_query_to_df( 8550 f""" SELECT 
{samples_fields} FROM {table_variants} """ 8551 ) 8552 8553 # Create genotypeconcordance column 8554 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 8555 dataframe_genotypeconcordance.apply( 8556 lambda row: genotypeconcordance( 8557 row, samples=self.get_header_sample_list() 8558 ), 8559 axis=1, 8560 ) 8561 ) 8562 8563 # Add genotypeconcordance to header 8564 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 8565 genotypeconcordance_tag, 8566 ".", 8567 "String", 8568 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 8569 "howard calculation", 8570 "0", 8571 self.code_type_map.get("String"), 8572 ) 8573 8574 # Update 8575 sql_update = f""" 8576 UPDATE variants 8577 SET "INFO" = 8578 concat( 8579 CASE 8580 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8581 THEN '' 8582 ELSE concat("INFO", ';') 8583 END, 8584 CASE 8585 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 8586 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 8587 THEN concat( 8588 '{genotypeconcordance_tag}=', 8589 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 8590 ) 8591 ELSE '' 8592 END 8593 ) 8594 FROM dataframe_genotypeconcordance 8595 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 8596 """ 8597 self.conn.execute(sql_update) 8598 8599 # Remove added columns 8600 for added_column in added_columns: 8601 self.drop_column(column=added_column) 8602 8603 # Delete dataframe 8604 del dataframe_genotypeconcordance 8605 gc.collect() 8606 8607 def calculation_barcode(self, tag: str = "barcode") -> None: 8608 """ 8609 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 8610 updates the INFO field in the file with the calculated barcode values. 8611 8612 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 8613 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 8614 the default tag name is set to "barcode", defaults to barcode 8615 :type tag: str (optional) 8616 """ 8617 8618 # if FORMAT and samples 8619 if ( 8620 "FORMAT" in self.get_header_columns_as_list() 8621 and self.get_header_sample_list() 8622 ): 8623 8624 # barcode annotation field 8625 if not tag: 8626 tag = "barcode" 8627 8628 # VCF infos tags 8629 vcf_infos_tags = { 8630 tag: "barcode calculation (VaRank)", 8631 } 8632 8633 # Prefix 8634 prefix = self.get_explode_infos_prefix() 8635 8636 # Field 8637 barcode_infos = prefix + tag 8638 8639 # Variants table 8640 table_variants = self.get_table_variants() 8641 8642 # Header 8643 vcf_reader = self.get_header() 8644 8645 # Create variant id 8646 variant_id_column = self.get_variant_id_column() 8647 added_columns = [variant_id_column] 8648 8649 # variant_id, FORMAT and samples 8650 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8651 self.get_header_sample_list() 8652 ) 8653 8654 # Create dataframe 8655 dataframe_barcode = self.get_query_to_df( 8656 f""" SELECT {samples_fields} FROM {table_variants} """ 8657 ) 8658 8659 # Create barcode column 8660 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8661 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 8662 ) 8663 8664 # Add barcode to header 8665 vcf_reader.infos[tag] = vcf.parser._Info( 8666 tag, 8667 ".", 8668 "String", 8669 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 8670 "howard calculation", 8671 "0", 8672 self.code_type_map.get("String"), 8673 ) 8674 8675 # Update 8676 sql_update = f""" 8677 UPDATE {table_variants} 8678 SET "INFO" = 8679 concat( 8680 CASE 8681 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8682 THEN '' 8683 ELSE concat("INFO", ';') 8684 END, 8685 CASE 8686 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 8687 AND dataframe_barcode."{barcode_infos}" NOT NULL 8688 THEN concat( 8689 '{tag}=', 8690 dataframe_barcode."{barcode_infos}" 8691 ) 8692 ELSE '' 8693 
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Compute the family barcode over the pedigree samples and append it (plus
        the list of pedigree samples) to the FORMAT/sample columns as two new
        genotype fields '<tag>' and '<tag>S'.

        Does nothing when the VCF has no FORMAT column or no samples.

        :param tag: genotype tag name for the family barcode; an empty value
            falls back to "BCF", defaults to "BCF"
        :type tag: str (optional)
        """

        # Only meaningful when genotypes are present
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fall back to default on empty tag)
            if not tag:
                tag = "BCF"

            # Descriptions for the generated VCF header entries
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Parameters object
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Pedigree parameter: may be a JSON file path, a JSON string, a
            # comma-separated sample list, or a dict
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Normalize the pedigree into a dict {member: sample}
            if ped:

                # Pedigree is a file
                # NOTE(review): 'ped' is rebound to the file handle then to the
                # parsed JSON — works, but the shadowing is fragile.
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: try JSON first, else comma-separated list
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Samples involved in the barcode, in pedigree order
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Working column name
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # VCF header object (mutated below)
            vcf_reader = self.get_header()

            # Temporary unique variant id column used as the UPDATE join key
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant id, FORMAT and the pedigree samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode per variant row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the two new genotype fields in the VCF header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column: FORMAT gets the tag names,
            # pedigree samples get the barcode value and sample list, other
            # samples get '.' placeholders. Missing genotypes './.' are padded
            # with ':.' per extra FORMAT key before appending.
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in one UPDATE, joining on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the working columns created above
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the (potentially large) dataframe promptly
            del dataframe_barcode
            gc.collect()
vcf.parser._Format( 8836 id=tag, 8837 num=".", 8838 type="String", 8839 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 8840 type_code=self.code_type_map.get("String"), 8841 ) 8842 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 8843 id=f"{tag}S", 8844 num=".", 8845 type="String", 8846 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 8847 type_code=self.code_type_map.get("String"), 8848 ) 8849 8850 # Update 8851 # for sample in ped_samples: 8852 sql_update_set = [] 8853 for sample in self.get_header_sample_list() + ["FORMAT"]: 8854 if sample in ped_samples: 8855 value = f'dataframe_barcode."{barcode_infos}"' 8856 value_samples = "'" + ",".join(ped_samples) + "'" 8857 elif sample == "FORMAT": 8858 value = f"'{tag}'" 8859 value_samples = f"'{tag}S'" 8860 else: 8861 value = "'.'" 8862 value_samples = "'.'" 8863 format_regex = r"[a-zA-Z0-9\s]" 8864 sql_update_set.append( 8865 f""" 8866 "{sample}" = 8867 concat( 8868 CASE 8869 WHEN {table_variants}."{sample}" = './.' 8870 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 8871 ELSE {table_variants}."{sample}" 8872 END, 8873 ':', 8874 {value}, 8875 ':', 8876 {value_samples} 8877 ) 8878 """ 8879 ) 8880 8881 sql_update_set_join = ", ".join(sql_update_set) 8882 sql_update = f""" 8883 UPDATE {table_variants} 8884 SET {sql_update_set_join} 8885 FROM dataframe_barcode 8886 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 8887 """ 8888 self.conn.execute(sql_update) 8889 8890 # Remove added columns 8891 for added_column in added_columns: 8892 self.drop_column(column=added_column) 8893 8894 # Delete dataframe 8895 del dataframe_barcode 8896 gc.collect() 8897 8898 def calculation_trio(self) -> None: 8899 """ 8900 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 8901 information to the INFO field of each variant. 
8902 """ 8903 8904 # if FORMAT and samples 8905 if ( 8906 "FORMAT" in self.get_header_columns_as_list() 8907 and self.get_header_sample_list() 8908 ): 8909 8910 # trio annotation field 8911 trio_tag = "trio" 8912 8913 # VCF infos tags 8914 vcf_infos_tags = { 8915 "trio": "trio calculation", 8916 } 8917 8918 # Param 8919 param = self.get_param() 8920 8921 # Prefix 8922 prefix = self.get_explode_infos_prefix() 8923 8924 # Trio param 8925 trio_ped = ( 8926 param.get("calculation", {}) 8927 .get("calculations", {}) 8928 .get("TRIO", {}) 8929 .get("trio_pedigree", None) 8930 ) 8931 8932 # Load trio 8933 if trio_ped: 8934 8935 # Trio pedigree is a file 8936 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 8937 log.debug("TRIO pedigree is file") 8938 with open(full_path(trio_ped)) as trio_ped: 8939 trio_ped = json.load(trio_ped) 8940 8941 # Trio pedigree is a string 8942 elif isinstance(trio_ped, str): 8943 log.debug("TRIO pedigree is str") 8944 try: 8945 trio_ped = json.loads(trio_ped) 8946 log.debug("TRIO pedigree is json str") 8947 except ValueError as e: 8948 trio_samples = trio_ped.split(",") 8949 if len(trio_samples) == 3: 8950 trio_ped = { 8951 "father": trio_samples[0], 8952 "mother": trio_samples[1], 8953 "child": trio_samples[2], 8954 } 8955 log.debug("TRIO pedigree is list str") 8956 else: 8957 msg_error = "TRIO pedigree not well formatted" 8958 log.error(msg_error) 8959 raise ValueError(msg_error) 8960 8961 # Trio pedigree is a dict 8962 elif isinstance(trio_ped, dict): 8963 log.debug("TRIO pedigree is dict") 8964 8965 # Trio pedigree is not well formatted 8966 else: 8967 msg_error = "TRIO pedigree not well formatted" 8968 log.error(msg_error) 8969 raise ValueError(msg_error) 8970 8971 # Construct trio list 8972 trio_samples = [ 8973 trio_ped.get("father", ""), 8974 trio_ped.get("mother", ""), 8975 trio_ped.get("child", ""), 8976 ] 8977 8978 else: 8979 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 8980 samples_list = self.get_header_sample_list() 8981 if len(samples_list) >= 3: 8982 trio_samples = self.get_header_sample_list()[0:3] 8983 trio_ped = { 8984 "father": trio_samples[0], 8985 "mother": trio_samples[1], 8986 "child": trio_samples[2], 8987 } 8988 else: 8989 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 8990 log.error(msg_error) 8991 raise ValueError(msg_error) 8992 8993 # Check trio pedigree 8994 if not trio_ped or len(trio_ped) != 3: 8995 msg_error = f"Error in TRIO pedigree: {trio_ped}" 8996 log.error(msg_error) 8997 raise ValueError(msg_error) 8998 8999 # Log 9000 log.info( 9001 f"Calculation 'TRIO' - Samples: " 9002 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9003 ) 9004 9005 # Field 9006 trio_infos = prefix + trio_tag 9007 9008 # Variants table 9009 table_variants = self.get_table_variants() 9010 9011 # Header 9012 vcf_reader = self.get_header() 9013 9014 # Create variant id 9015 variant_id_column = self.get_variant_id_column() 9016 added_columns = [variant_id_column] 9017 9018 # variant_id, FORMAT and samples 9019 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9020 self.get_header_sample_list() 9021 ) 9022 9023 # Create dataframe 9024 dataframe_trio = self.get_query_to_df( 9025 f""" SELECT {samples_fields} FROM {table_variants} """ 9026 ) 9027 9028 # Create trio column 9029 dataframe_trio[trio_infos] = dataframe_trio.apply( 9030 lambda row: trio(row, samples=trio_samples), axis=1 9031 ) 9032 9033 # Add trio to header 9034 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9035 trio_tag, 9036 ".", 9037 "String", 9038 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9039 "howard calculation", 9040 "0", 9041 self.code_type_map.get("String"), 9042 ) 9043 9044 # Update 9045 sql_update = f""" 9046 UPDATE {table_variants} 9047 SET "INFO" = 9048 concat( 9049 CASE 9050 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9051 THEN '' 9052 ELSE 
concat("INFO", ';') 9053 END, 9054 CASE 9055 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9056 AND dataframe_trio."{trio_infos}" NOT NULL 9057 THEN concat( 9058 '{trio_tag}=', 9059 dataframe_trio."{trio_infos}" 9060 ) 9061 ELSE '' 9062 END 9063 ) 9064 FROM dataframe_trio 9065 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9066 """ 9067 self.conn.execute(sql_update) 9068 9069 # Remove added columns 9070 for added_column in added_columns: 9071 self.drop_column(column=added_column) 9072 9073 # Delete dataframe 9074 del dataframe_trio 9075 gc.collect() 9076 9077 def calculation_vaf_normalization(self) -> None: 9078 """ 9079 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9080 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9081 :return: The function does not return anything. 9082 """ 9083 9084 # if FORMAT and samples 9085 if ( 9086 "FORMAT" in self.get_header_columns_as_list() 9087 and self.get_header_sample_list() 9088 ): 9089 9090 # vaf_normalization annotation field 9091 vaf_normalization_tag = "VAF" 9092 9093 # VCF infos tags 9094 vcf_infos_tags = { 9095 "VAF": "VAF Variant Frequency", 9096 } 9097 9098 # Prefix 9099 prefix = self.get_explode_infos_prefix() 9100 9101 # Variants table 9102 table_variants = self.get_table_variants() 9103 9104 # Header 9105 vcf_reader = self.get_header() 9106 9107 # Do not calculate if VAF already exists 9108 if "VAF" in vcf_reader.formats: 9109 log.debug("VAF already on genotypes") 9110 return 9111 9112 # Create variant id 9113 variant_id_column = self.get_variant_id_column() 9114 added_columns = [variant_id_column] 9115 9116 # variant_id, FORMAT and samples 9117 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9118 f""" "{sample}" """ for sample in self.get_header_sample_list() 9119 ) 9120 9121 # Create dataframe 9122 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Compute per-variant statistics (count, list, min, max, mean, median,
        stdev) of a genotype field across samples and append them to INFO as
        '<info>_stats_*' tags.

        Does nothing when the VCF has no FORMAT column or no samples.

        :param info: genotype field to compute statistics on, defaults to "VAF"
        :type info: str (optional)
        """

        # Only meaningful when genotypes are present
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Base tag of the working column holding the stats dict
            vaf_stats_tag = info + "_stats"

            # One INFO tag (and description) per statistic
            # NOTE(review): "mediane" is kept as-is — it is part of the emitted
            # tag names, so renaming it would change output.
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Working column name
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # VCF header object (mutated below)
            vcf_reader = self.get_header()

            # Temporary unique variant id column used as the UPDATE join key
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant id, FORMAT and every sample
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the stats dict per variant row
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL concat fragments, one per statistic
            sql_vaf_stats_fields = []

            # Explode each statistic into its own column and header entry
            for stat in vcf_infos_tags:

                # Extract the statistic from the stats dict
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First fragment has no leading ';' (INFO handling below adds one)
                # NOTE(review): if the first statistic is NULL its fragment is ''
                # but later fragments still start with ';', which can yield a
                # double ';;' in INFO — confirm whether that edge case matters.
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # Fragments become additional arguments of the concat() below
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Append the statistics to INFO, joining on the variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Drop the working columns created above
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the (potentially large) dataframe promptly
            del dataframe_vaf_stats
            gc.collect()
lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of SQL fragments (one per stats tag) joined into the INFO update
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this stat from the per-row stats dict into its own column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add the stats tag to the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator: empty before the first tag, ';' between subsequent tags
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO (skip NULL stats values)
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update: append the stats tags to INFO, joining on the variant id column
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_vaf_stats
            gc.collect()

    def calculation_transcripts_annotation(
        self, info_json: str = None, info_format: str = None
    ) -> None:
        """
        Create the transcripts table and export it to the variants table INFO field.

        :param info_json: INFO field name used to store the transcripts as JSON
            (forwarded to `transcript_view_to_variants`)
        :type info_json: str
        :param info_format: format of the transcripts INFO field
            (forwarded to `transcript_view_to_variants`)
        :type info_format: str
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Add info field (only if a transcripts table could be created)
        if transcripts_table:
            self.transcript_view_to_variants(
                transcripts_table=transcripts_table,
                transcripts_info_field_json=info_json,
                transcripts_info_field_format=info_format,
            )
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    def calculation_transcripts_prioritization(self) -> None:
        """
        Create the transcripts table and prioritize transcripts
        (see `transcripts_prioritization`).
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Prioritize transcripts (only if a transcripts table could be created)
        if transcripts_table:
            self.transcripts_prioritization(transcripts_table=transcripts_table)
        else:
            log.info("No Transcripts to process. 
Check param.json file configuration") 9358 9359 ############### 9360 # Transcripts # 9361 ############### 9362 9363 def transcripts_prioritization( 9364 self, transcripts_table: str = None, param: dict = {} 9365 ) -> bool: 9366 """ 9367 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 9368 and updates the variants table with the prioritized information. 9369 9370 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 9371 of the table containing transcripts data. If no value is provided, it defaults to "transcripts". 9372 This parameter is used to identify the table where the transcripts data is stored for the 9373 prioritization process 9374 :type transcripts_table: str 9375 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 9376 that contains various configuration settings for the prioritization process of transcripts. It 9377 is used to customize the behavior of the prioritization algorithm and includes settings such as 9378 the prefix for prioritization fields, default profiles, and other 9379 :type param: dict 9380 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 9381 transcripts prioritization process is successfully completed, and `False` if there are any 9382 issues or if no profile is defined for transcripts prioritization. 
9383 """ 9384 9385 log.debug("Start transcripts prioritization...") 9386 9387 # Param 9388 if not param: 9389 param = self.get_param() 9390 9391 # Variants table 9392 table_variants = self.get_table_variants() 9393 log.debug(f"transcripts_table={transcripts_table}") 9394 # Transcripts table 9395 if transcripts_table is None: 9396 log.debug(f"transcripts_table={transcripts_table}") 9397 transcripts_table = self.create_transcript_view( 9398 transcripts_table="transcripts", param=param 9399 ) 9400 log.debug(f"transcripts_table={transcripts_table}") 9401 if transcripts_table is None: 9402 msg_err = "No Transcripts table availalble" 9403 log.error(msg_err) 9404 raise ValueError(msg_err) 9405 9406 # Get transcripts columns 9407 columns_as_list_query = f""" 9408 DESCRIBE {transcripts_table} 9409 """ 9410 columns_as_list = list( 9411 self.get_query_to_df(columns_as_list_query)["column_name"] 9412 ) 9413 9414 # Create INFO if not exists 9415 if "INFO" not in columns_as_list: 9416 query_add_info = f""" 9417 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 9418 """ 9419 self.execute_query(query_add_info) 9420 9421 # Prioritization param and Force only PZ Score and Flag 9422 pz_param = param.get("transcripts", {}).get("prioritization", {}) 9423 pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score" 9424 pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag" 9425 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 9426 pz_param["pzfields"] = [pz_fields_score, pz_fields_flag] 9427 pz_profile_default = ( 9428 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 9429 ) 9430 9431 # Exit if no profile 9432 if pz_profile_default is None: 9433 log.warning("No profile defined for transcripts prioritization") 9434 return False 9435 9436 # Prioritization 9437 prioritization_result = self.prioritization( 9438 table=transcripts_table, 9439 pz_param=param.get("transcripts", {}).get("prioritization", {}), 9440 ) 9441 if not 
prioritization_result: 9442 log.warning("Transcripts prioritization not processed") 9443 return False 9444 9445 # Explode PZ fields 9446 self.explode_infos( 9447 table=transcripts_table, 9448 fields=param.get("transcripts", {}) 9449 .get("prioritization", {}) 9450 .get("pzfields", []), 9451 ) 9452 9453 # Export Transcripts prioritization infos to variants table 9454 query_update = f""" 9455 WITH RankedTranscripts AS ( 9456 SELECT 9457 "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag}, 9458 ROW_NUMBER() OVER ( 9459 PARTITION BY "#CHROM", POS, REF, ALT 9460 ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC 9461 ) AS rn 9462 FROM 9463 {transcripts_table} 9464 ) 9465 UPDATE {table_variants} 9466 SET 9467 INFO = CONCAT(CASE 9468 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9469 THEN '' 9470 ELSE concat("INFO", ';') 9471 END, 9472 concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag}) 9473 ) 9474 FROM 9475 RankedTranscripts 9476 WHERE 9477 rn = 1 9478 AND variants."#CHROM" = RankedTranscripts."#CHROM" 9479 AND variants."POS" = RankedTranscripts."POS" 9480 AND variants."REF" = RankedTranscripts."REF" 9481 AND variants."ALT" = RankedTranscripts."ALT" 9482 9483 """ 9484 self.execute_query(query=query_update) 9485 9486 # Add PZ Transcript in header 9487 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 9488 pz_fields_transcripts, 9489 ".", 9490 "String", 9491 f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}", 9492 "unknown", 9493 "unknown", 9494 code_type_map["String"], 9495 ) 9496 9497 # Return 9498 return True 9499 9500 def create_transcript_view_from_columns_map( 9501 self, 9502 transcripts_table: str = "transcripts", 9503 columns_maps: dict = {}, 9504 added_columns: list = [], 9505 temporary_tables: list = None, 9506 annotation_fields: list = None, 9507 ) -> tuple[list, list, list]: 9508 """ 9509 The 
`create_transcript_view_from_columns_map` method builds one temporary table
        per columns-map entry, exploding comma-separated transcript columns into rows.

        :param transcripts_table: Name prefix used for the generated temporary
            tables, defaults to "transcripts"
        :type transcripts_table: str (optional)
        :param columns_maps: List of mapping configurations; each entry holds a
            main "transcripts_column" and additional "transcripts_infos_columns"
            (see the commented example below)
        :type columns_maps: dict
        :param added_columns: List collecting the columns added (exploded) on the
            variants table; mutated in place and also returned.
            NOTE(review): the mutable default `[]` is shared across calls —
            callers should pass an explicit list; confirm before changing
        :type added_columns: list
        :param temporary_tables: List collecting the names of the temporary
            tables created; mutated in place and also returned
        :type temporary_tables: list
        :param annotation_fields: List collecting the annotation field names
            found in the columns maps; mutated in place and also returned
        :type annotation_fields: list
        :return: tuple (added_columns, temporary_tables, annotation_fields)
        """

        log.debug("Start transcrpts view creation from columns map...")

        # Example of expected "from_columns_map" param structure:
        # "from_columns_map": [
        #     {
        #         "transcripts_column": "Ensembl_transcriptid",
        #         "transcripts_infos_columns": [
        #             "genename",
        #             "Ensembl_geneid",
        #             "LIST_S2_score",
        #             "LIST_S2_pred",
        #         ],
        #     },
        #     {
        #         "transcripts_column": "Ensembl_transcriptid",
        #         "transcripts_infos_columns": [
        #             "genename",
        #             "VARITY_R_score",
        #             "Aloft_pred",
        #         ],
        #     },
        # ],

        # Init
        if temporary_tables is None:
            temporary_tables = []
        if annotation_fields is None:
            annotation_fields = []

        # Variants table
        table_variants = self.get_table_variants()

        for columns_map in columns_maps:

            # Transcript column
            transcripts_column = columns_map.get("transcripts_column", None)

            # Transcripts infos columns
            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])

            if transcripts_column is not None:

                # Explode INFO fields into columns on the variants table
                added_columns += self.explode_infos(
                    fields=[transcripts_column] + transcripts_infos_columns
                )

                # View clauses: split each comma-separated column into rows
                clause_select = []
                for field in [transcripts_column] + transcripts_infos_columns:
                    clause_select.append(
                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                    )
                    if field not in [transcripts_column]:
                        annotation_fields.append(field)

                # Query view
                query = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT,
                        "{transcripts_column}" AS 'transcript',
                        {", ".join(clause_select)}
                    FROM (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            {", ".join(clause_select)}
                        FROM {table_variants}
                    )
                    WHERE "{transcripts_column}" IS NOT NULL
                """

                # Create temporary table name with a random suffix to avoid clashes
                temporary_table = transcripts_table + "".join(
                    random.choices(string.ascii_uppercase + string.digits, k=10)
                )

                # Temporary_tables
                temporary_tables.append(temporary_table)
                query_view = f"""
                    CREATE TEMPORARY TABLE {temporary_table}
                    AS ({query})
                """
                self.execute_query(query=query_view)

        return added_columns, temporary_tables, annotation_fields

    def create_transcript_view_from_column_format(
        self,
        transcripts_table: str = "transcripts",
        column_formats: dict = {},
        temporary_tables: list = None,
        annotation_fields: list = None,
    ) -> tuple[list, list]:
        """
        Generate one temporary transcripts view per column-format entry (e.g. a
        snpEff "ANN" field), collecting the created views and their annotation
        fields.

        :param transcripts_table: Name prefix used for the generated temporary
            views, defaults to "transcripts"
        :type transcripts_table: str (optional)
        :param column_formats: List of configurations, each mapping a
            "transcripts_column" (e.g. "ANN") to a "transcripts_infos_column"
            (e.g. "Feature_ID"); see the commented example below
        :type column_formats: dict
        :param temporary_tables: List collecting the names of the temporary
            views created; mutated in place and also returned
        :type temporary_tables: list
        :param annotation_fields: List collecting the annotation fields found in
            the created views; mutated in place and also returned
        :type annotation_fields: list
        :return: tuple (temporary_tables, annotation_fields)
9661 """ 9662 9663 log.debug("Start transcrpts view creation from column format...") 9664 9665 # "from_column_format": [ 9666 # { 9667 # "transcripts_column": "ANN", 9668 # "transcripts_infos_column": "Feature_ID", 9669 # } 9670 # ], 9671 9672 # Init 9673 if temporary_tables is None: 9674 temporary_tables = [] 9675 if annotation_fields is None: 9676 annotation_fields = [] 9677 9678 for column_format in column_formats: 9679 9680 # annotation field and transcript annotation field 9681 annotation_field = column_format.get("transcripts_column", "ANN") 9682 transcript_annotation = column_format.get( 9683 "transcripts_infos_column", "Feature_ID" 9684 ) 9685 9686 # Temporary View name 9687 temporary_view_name = transcripts_table + "".join( 9688 random.choices(string.ascii_uppercase + string.digits, k=10) 9689 ) 9690 9691 # Create temporary view name 9692 temporary_view_name = self.annotation_format_to_table( 9693 uniquify=True, 9694 annotation_field=annotation_field, 9695 view_name=temporary_view_name, 9696 annotation_id=transcript_annotation, 9697 ) 9698 9699 # Annotation fields 9700 if temporary_view_name: 9701 query_annotation_fields = f""" 9702 SELECT * 9703 FROM ( 9704 DESCRIBE SELECT * 9705 FROM {temporary_view_name} 9706 ) 9707 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 9708 """ 9709 df_annotation_fields = self.get_query_to_df( 9710 query=query_annotation_fields 9711 ) 9712 9713 # Add temporary view and annotation fields 9714 temporary_tables.append(temporary_view_name) 9715 annotation_fields += list(set(df_annotation_fields["column_name"])) 9716 9717 return temporary_tables, annotation_fields 9718 9719 def create_transcript_view( 9720 self, 9721 transcripts_table: str = None, 9722 transcripts_table_drop: bool = True, 9723 param: dict = {}, 9724 ) -> str: 9725 """ 9726 The `create_transcript_view` function generates a transcript view by processing data from a 9727 specified table based on provided parameters and structural information. 
9728 9729 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 9730 is used to specify the name of the table that will store the final transcript view data. If a table 9731 name is not provided, the function will create a new table to store the transcript view data, and by 9732 default,, defaults to transcripts 9733 :type transcripts_table: str (optional) 9734 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 9735 `create_transcript_view` function is a boolean parameter that determines whether to drop the 9736 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 9737 the function will drop the existing transcripts table if it exists, defaults to True 9738 :type transcripts_table_drop: bool (optional) 9739 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 9740 contains information needed to create a transcript view. It includes details such as the structure 9741 of the transcripts, columns mapping, column formats, and other necessary information for generating 9742 the view. This parameter allows for flexibility and customization 9743 :type param: dict 9744 :return: The `create_transcript_view` function returns the name of the transcripts table that was 9745 created or modified during the execution of the function. 
9746 """ 9747 9748 log.debug("Start transcripts view creation...") 9749 9750 # Default 9751 transcripts_table_default = "transcripts" 9752 9753 # Param 9754 if not param: 9755 param = self.get_param() 9756 9757 # Struct 9758 struct = param.get("transcripts", {}).get("struct", None) 9759 9760 if struct: 9761 9762 # Transcripts table 9763 if transcripts_table is None: 9764 transcripts_table = param.get("transcripts", {}).get( 9765 "table", transcripts_table_default 9766 ) 9767 9768 # added_columns 9769 added_columns = [] 9770 9771 # Temporary tables 9772 temporary_tables = [] 9773 9774 # Annotation fields 9775 annotation_fields = [] 9776 9777 # from columns map 9778 columns_maps = struct.get("from_columns_map", []) 9779 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 9780 self.create_transcript_view_from_columns_map( 9781 transcripts_table=transcripts_table, 9782 columns_maps=columns_maps, 9783 added_columns=added_columns, 9784 temporary_tables=temporary_tables, 9785 annotation_fields=annotation_fields, 9786 ) 9787 ) 9788 added_columns += added_columns_tmp 9789 temporary_tables += temporary_tables_tmp 9790 annotation_fields += annotation_fields_tmp 9791 9792 # from column format 9793 column_formats = struct.get("from_column_format", []) 9794 temporary_tables_tmp, annotation_fields_tmp = ( 9795 self.create_transcript_view_from_column_format( 9796 transcripts_table=transcripts_table, 9797 column_formats=column_formats, 9798 temporary_tables=temporary_tables, 9799 annotation_fields=annotation_fields, 9800 ) 9801 ) 9802 temporary_tables += temporary_tables_tmp 9803 annotation_fields += annotation_fields_tmp 9804 9805 # Merge temporary tables query 9806 query_merge = "" 9807 for temporary_table in temporary_tables: 9808 9809 # First temporary table 9810 if not query_merge: 9811 query_merge = f""" 9812 SELECT * FROM {temporary_table} 9813 """ 9814 # other temporary table (using UNION) 9815 else: 9816 query_merge += f""" 9817 UNION BY NAME SELECT * FROM 
{temporary_table} 9818 """ 9819 9820 # Merge on transcript 9821 query_merge_on_transcripts_annotation_fields = [] 9822 # Aggregate all annotations fields 9823 for annotation_field in set(annotation_fields): 9824 query_merge_on_transcripts_annotation_fields.append( 9825 f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """ 9826 ) 9827 # Query for transcripts view 9828 query_merge_on_transcripts = f""" 9829 SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)} 9830 FROM ({query_merge}) 9831 GROUP BY "#CHROM", POS, REF, ALT, transcript 9832 """ 9833 9834 # Drop transcript view is necessary 9835 if transcripts_table_drop: 9836 query_drop = f""" 9837 DROP TABLE IF EXISTS {transcripts_table}; 9838 """ 9839 self.execute_query(query=query_drop) 9840 9841 # Merge and create transcript view 9842 query_create_view = f""" 9843 CREATE TABLE IF NOT EXISTS {transcripts_table} 9844 AS {query_merge_on_transcripts} 9845 """ 9846 self.execute_query(query=query_create_view) 9847 9848 # Remove added columns 9849 for added_column in added_columns: 9850 self.drop_column(column=added_column) 9851 9852 else: 9853 9854 transcripts_table = None 9855 9856 return transcripts_table 9857 9858 def annotation_format_to_table( 9859 self, 9860 uniquify: bool = True, 9861 annotation_field: str = "ANN", 9862 annotation_id: str = "Feature_ID", 9863 view_name: str = "transcripts", 9864 ) -> str: 9865 """ 9866 The function `annotation_format_to_table` converts annotation data from a VCF file into a structured 9867 table format. 9868 9869 :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique 9870 values in the output or not. 
If set to `True`, the function will make sure that the output values 9871 are unique, defaults to True 9872 :type uniquify: bool (optional) 9873 :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file that 9874 contains the annotation information for each variant. This field is used to extract the annotation 9875 details for further processing in the function, defaults to ANN 9876 :type annotation_field: str (optional) 9877 :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method is 9878 used to specify the identifier for the annotation feature. This identifier will be used as a column 9879 name in the resulting table or view that is created based on the annotation data. It helps in 9880 uniquely identifying each annotation entry in the, defaults to Feature_ID 9881 :type annotation_id: str (optional) 9882 :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used to 9883 specify the name of the temporary table that will be created to store the transformed annotation 9884 data. This table will hold the extracted information from the annotation field in a structured 9885 format for further processing or analysis, defaults to transcripts 9886 :type view_name: str (optional) 9887 :return: The function `annotation_format_to_table` is returning the name of the view created, which 9888 is stored in the variable `view_name`. 
9889 """ 9890 9891 # Annotation field 9892 annotation_format = "annotation_explode" 9893 9894 # Transcript annotation 9895 annotation_id = "".join(char for char in annotation_id if char.isalnum()) 9896 9897 # Prefix 9898 prefix = self.get_explode_infos_prefix() 9899 if prefix: 9900 prefix = "INFO/" 9901 9902 # Annotation fields 9903 annotation_infos = prefix + annotation_field 9904 annotation_format_infos = prefix + annotation_format 9905 9906 # Variants table 9907 table_variants = self.get_table_variants() 9908 9909 # Header 9910 vcf_reader = self.get_header() 9911 9912 # Add columns 9913 added_columns = [] 9914 9915 # Explode HGVS field in column 9916 added_columns += self.explode_infos(fields=[annotation_field]) 9917 9918 if annotation_field in vcf_reader.infos: 9919 9920 # Extract ANN header 9921 ann_description = vcf_reader.infos[annotation_field].desc 9922 pattern = r"'(.+?)'" 9923 match = re.search(pattern, ann_description) 9924 if match: 9925 ann_header_match = match.group(1).split(" | ") 9926 ann_header = [] 9927 ann_header_desc = {} 9928 for i in range(len(ann_header_match)): 9929 ann_header_info = "".join( 9930 char for char in ann_header_match[i] if char.isalnum() 9931 ) 9932 ann_header.append(ann_header_info) 9933 ann_header_desc[ann_header_info] = ann_header_match[i] 9934 if not ann_header_desc: 9935 raise ValueError("Invalid header description format") 9936 else: 9937 raise ValueError("Invalid header description format") 9938 9939 # Create variant id 9940 variant_id_column = self.get_variant_id_column() 9941 added_columns += [variant_id_column] 9942 9943 # Create dataframe 9944 dataframe_annotation_format = self.get_query_to_df( 9945 f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """ 9946 ) 9947 9948 # Create annotation columns 9949 dataframe_annotation_format[ 9950 annotation_format_infos 9951 ] = dataframe_annotation_format[annotation_infos].apply( 9952 lambda x: explode_annotation_format( 
9953 annotation=str(x), 9954 uniquify=uniquify, 9955 output_format="JSON", 9956 prefix="", 9957 header=list(ann_header_desc.values()), 9958 ) 9959 ) 9960 9961 # Find keys 9962 query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;""" 9963 df_keys = self.get_query_to_df(query=query_json) 9964 9965 # Check keys 9966 query_json_key = [] 9967 for _, row in df_keys.iterrows(): 9968 9969 # Key 9970 key = row.iloc[0] 9971 9972 # key_clean 9973 key_clean = "".join(char for char in key if char.isalnum()) 9974 9975 # Type 9976 query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');""" 9977 9978 # Get DataFrame from query 9979 df_json_type = self.get_query_to_df(query=query_json_type) 9980 9981 # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN 9982 with pd.option_context("future.no_silent_downcasting", True): 9983 df_json_type.fillna(value="", inplace=True) 9984 replace_dict = {None: np.nan, "": np.nan} 9985 df_json_type.replace(replace_dict, inplace=True) 9986 df_json_type.dropna(inplace=True) 9987 9988 # Detect column type 9989 column_type = detect_column_type(df_json_type[key_clean]) 9990 9991 # Append 9992 query_json_key.append( 9993 f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """ 9994 ) 9995 9996 # Create view 9997 query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));""" 9998 self.execute_query(query=query_view) 9999 10000 else: 10001 10002 # Return None 10003 view_name = None 10004 10005 # Remove added columns 10006 for added_column in added_columns: 10007 self.drop_column(column=added_column) 10008 10009 
return view_name 10010 10011 def transcript_view_to_variants( 10012 self, 10013 transcripts_table: str = None, 10014 transcripts_column_id: str = None, 10015 transcripts_info_json: str = None, 10016 transcripts_info_field_json: str = None, 10017 transcripts_info_format: str = None, 10018 transcripts_info_field_format: str = None, 10019 param: dict = {}, 10020 ) -> bool: 10021 """ 10022 The `transcript_view_to_variants` function updates a variants table with information from 10023 transcripts in JSON format. 10024 10025 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 10026 table containing the transcripts data. If this parameter is not provided, the function will 10027 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 10028 :type transcripts_table: str 10029 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 10030 column in the `transcripts_table` that contains the unique identifier for each transcript. This 10031 identifier is used to match transcripts with variants in the database 10032 :type transcripts_column_id: str 10033 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 10034 of the column in the variants table where the transcripts information will be stored in JSON 10035 format. This parameter allows you to define the column in the variants table that will hold the 10036 JSON-formatted information about transcripts 10037 :type transcripts_info_json: str 10038 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 10039 specify the field in the VCF header that will contain information about transcripts in JSON 10040 format. 
This field will be added to the VCF header as an INFO field with the specified name 10041 :type transcripts_info_field_json: str 10042 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 10043 format of the information about transcripts that will be stored in the variants table. This 10044 format can be used to define how the transcript information will be structured or displayed 10045 within the variants table 10046 :type transcripts_info_format: str 10047 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 10048 specify the field in the VCF header that will contain information about transcripts in a 10049 specific format. This field will be added to the VCF header as an INFO field with the specified 10050 name 10051 :type transcripts_info_field_format: str 10052 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 10053 that contains various configuration settings related to transcripts. It is used to provide 10054 default values for certain parameters if they are not explicitly provided when calling the 10055 method. The `param` dictionary can be passed as an argument 10056 :type param: dict 10057 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 10058 if the operation is successful and `False` if certain conditions are not met. 
10059 """ 10060 10061 msg_info_prefix = "Start transcripts view to variants annotations" 10062 10063 log.debug(f"{msg_info_prefix}...") 10064 10065 # Default 10066 transcripts_table_default = "transcripts" 10067 transcripts_column_id_default = "transcript" 10068 transcripts_info_json_default = None 10069 transcripts_info_format_default = None 10070 transcripts_info_field_json_default = None 10071 transcripts_info_field_format_default = None 10072 10073 # Param 10074 if not param: 10075 param = self.get_param() 10076 10077 # Transcripts table 10078 if transcripts_table is None: 10079 transcripts_table = param.get("transcripts", {}).get( 10080 "table", transcripts_table_default 10081 ) 10082 10083 # Transcripts column ID 10084 if transcripts_column_id is None: 10085 transcripts_column_id = param.get("transcripts", {}).get( 10086 "column_id", transcripts_column_id_default 10087 ) 10088 10089 # Transcripts info json 10090 if transcripts_info_json is None: 10091 transcripts_info_json = param.get("transcripts", {}).get( 10092 "transcripts_info_json", transcripts_info_json_default 10093 ) 10094 10095 # Transcripts info field JSON 10096 if transcripts_info_field_json is None: 10097 transcripts_info_field_json = param.get("transcripts", {}).get( 10098 "transcripts_info_field_json", transcripts_info_field_json_default 10099 ) 10100 # if transcripts_info_field_json is not None and transcripts_info_json is None: 10101 # transcripts_info_json = transcripts_info_field_json 10102 10103 # Transcripts info format 10104 if transcripts_info_format is None: 10105 transcripts_info_format = param.get("transcripts", {}).get( 10106 "transcripts_info_format", transcripts_info_format_default 10107 ) 10108 10109 # Transcripts info field FORMAT 10110 if transcripts_info_field_format is None: 10111 transcripts_info_field_format = param.get("transcripts", {}).get( 10112 "transcripts_info_field_format", transcripts_info_field_format_default 10113 ) 10114 # if ( 10115 # 
transcripts_info_field_format is not None 10116 # and transcripts_info_format is None 10117 # ): 10118 # transcripts_info_format = transcripts_info_field_format 10119 10120 # Variants table 10121 table_variants = self.get_table_variants() 10122 10123 # Check info columns param 10124 if ( 10125 transcripts_info_json is None 10126 and transcripts_info_field_json is None 10127 and transcripts_info_format is None 10128 and transcripts_info_field_format is None 10129 ): 10130 return False 10131 10132 # Transcripts infos columns 10133 query_transcripts_infos_columns = f""" 10134 SELECT * 10135 FROM ( 10136 DESCRIBE SELECT * FROM {transcripts_table} 10137 ) 10138 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 10139 """ 10140 transcripts_infos_columns = list( 10141 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 10142 ) 10143 10144 # View results 10145 clause_select = [] 10146 clause_to_json = [] 10147 clause_to_format = [] 10148 for field in transcripts_infos_columns: 10149 clause_select.append( 10150 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10151 ) 10152 clause_to_json.append(f""" '{field}': "{field}" """) 10153 clause_to_format.append(f""" "{field}" """) 10154 10155 # Update 10156 update_set_json = [] 10157 update_set_format = [] 10158 10159 # VCF header 10160 vcf_reader = self.get_header() 10161 10162 # Transcripts to info column in JSON 10163 if transcripts_info_json is not None: 10164 10165 # Create column on variants table 10166 self.add_column( 10167 table_name=table_variants, 10168 column_name=transcripts_info_json, 10169 column_type="JSON", 10170 default_value=None, 10171 drop=False, 10172 ) 10173 10174 # Add header 10175 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 10176 transcripts_info_json, 10177 ".", 10178 "String", 10179 "Transcripts in JSON format", 10180 "unknwon", 10181 "unknwon", 10182 self.code_type_map["String"], 10183 ) 10184 10185 # Add to update 
10186 update_set_json.append( 10187 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 10188 ) 10189 10190 # Transcripts to info field in JSON 10191 if transcripts_info_field_json is not None: 10192 10193 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 10194 10195 # Add to update 10196 update_set_json.append( 10197 f""" 10198 INFO = concat( 10199 CASE 10200 WHEN INFO NOT IN ('', '.') 10201 THEN INFO 10202 ELSE '' 10203 END, 10204 CASE 10205 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 10206 THEN concat( 10207 ';{transcripts_info_field_json}=', 10208 t.{transcripts_info_json} 10209 ) 10210 ELSE '' 10211 END 10212 ) 10213 """ 10214 ) 10215 10216 # Add header 10217 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 10218 transcripts_info_field_json, 10219 ".", 10220 "String", 10221 "Transcripts in JSON format", 10222 "unknwon", 10223 "unknwon", 10224 self.code_type_map["String"], 10225 ) 10226 10227 if update_set_json: 10228 10229 # Update query 10230 query_update = f""" 10231 UPDATE {table_variants} 10232 SET {", ".join(update_set_json)} 10233 FROM 10234 ( 10235 SELECT 10236 "#CHROM", POS, REF, ALT, 10237 concat( 10238 '{{', 10239 string_agg( 10240 '"' || "{transcripts_column_id}" || '":' || 10241 to_json(json_output) 10242 ), 10243 '}}' 10244 )::JSON AS {transcripts_info_json} 10245 FROM 10246 ( 10247 SELECT 10248 "#CHROM", POS, REF, ALT, 10249 "{transcripts_column_id}", 10250 to_json( 10251 {{{",".join(clause_to_json)}}} 10252 )::JSON AS json_output 10253 FROM 10254 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 10255 WHERE "{transcripts_column_id}" IS NOT NULL 10256 ) 10257 GROUP BY "#CHROM", POS, REF, ALT 10258 ) AS t 10259 WHERE {table_variants}."#CHROM" = t."#CHROM" 10260 AND {table_variants}."POS" = t."POS" 10261 AND {table_variants}."REF" = t."REF" 10262 AND {table_variants}."ALT" = t."ALT" 10263 """ 10264 10265 
self.execute_query(query=query_update) 10266 10267 # Transcripts to info column in FORMAT 10268 if transcripts_info_format is not None: 10269 10270 # Create column on variants table 10271 self.add_column( 10272 table_name=table_variants, 10273 column_name=transcripts_info_format, 10274 column_type="VARCHAR", 10275 default_value=None, 10276 drop=False, 10277 ) 10278 10279 # Add header 10280 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 10281 transcripts_info_format, 10282 ".", 10283 "String", 10284 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 10285 "unknwon", 10286 "unknwon", 10287 self.code_type_map["String"], 10288 ) 10289 10290 # Add to update 10291 update_set_format.append( 10292 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 10293 ) 10294 10295 # Transcripts to info field in JSON 10296 if transcripts_info_field_format is not None: 10297 10298 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 10299 10300 # Add to update 10301 update_set_format.append( 10302 f""" 10303 INFO = concat( 10304 CASE 10305 WHEN INFO NOT IN ('', '.') 10306 THEN INFO 10307 ELSE '' 10308 END, 10309 CASE 10310 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 10311 THEN concat( 10312 ';{transcripts_info_field_format}=', 10313 t.{transcripts_info_format} 10314 ) 10315 ELSE '' 10316 END 10317 ) 10318 """ 10319 ) 10320 10321 # Add header 10322 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 10323 transcripts_info_field_format, 10324 ".", 10325 "String", 10326 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 10327 "unknwon", 10328 "unknwon", 10329 self.code_type_map["String"], 10330 ) 10331 10332 if update_set_format: 10333 10334 # Update query 10335 query_update = f""" 10336 UPDATE {table_variants} 10337 SET {", ".join(update_set_format)} 10338 FROM 10339 ( 10340 SELECT 10341 "#CHROM", POS, REF, ALT, 10342 
string_agg({transcripts_info_format}) AS {transcripts_info_format} 10343 FROM 10344 ( 10345 SELECT 10346 "#CHROM", POS, REF, ALT, 10347 "{transcripts_column_id}", 10348 concat( 10349 "{transcripts_column_id}", 10350 '|', 10351 {", '|', ".join(clause_to_format)} 10352 ) AS {transcripts_info_format} 10353 FROM 10354 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 10355 ) 10356 GROUP BY "#CHROM", POS, REF, ALT 10357 ) AS t 10358 WHERE {table_variants}."#CHROM" = t."#CHROM" 10359 AND {table_variants}."POS" = t."POS" 10360 AND {table_variants}."REF" = t."REF" 10361 AND {table_variants}."ALT" = t."ALT" 10362 """ 10363 10364 self.execute_query(query=query_update) 10365 10366 return True
36 def __init__( 37 self, 38 conn=None, 39 input: str = None, 40 output: str = None, 41 config: dict = {}, 42 param: dict = {}, 43 load: bool = False, 44 ) -> None: 45 """ 46 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 47 header 48 49 :param conn: the connection to the database 50 :param input: the input file 51 :param output: the output file 52 :param config: a dictionary containing the configuration of the model 53 :param param: a dictionary containing the parameters of the model 54 """ 55 56 # Init variables 57 self.init_variables() 58 59 # Input 60 self.set_input(input) 61 62 # Config 63 self.set_config(config) 64 65 # Param 66 self.set_param(param) 67 68 # Output 69 self.set_output(output) 70 71 # connexion 72 self.set_connexion(conn) 73 74 # Header 75 self.set_header() 76 77 # Load data 78 if load: 79 self.load_data()
The function __init__ initializes the variables, sets the input, output, config, param, connexion and
header
Parameters
- conn: the connection to the database
- input: the input file
- output: the output file
- config: a dictionary containing the configuration of the model
- param: a dictionary containing the parameters of the model
81 def set_input(self, input: str = None) -> None: 82 """ 83 The function `set_input` takes a file name as input, extracts the name and extension, and sets 84 attributes in the class accordingly. 85 86 :param input: The `set_input` method in the provided code snippet is used to set attributes 87 related to the input file. Here's a breakdown of the parameters and their usage in the method: 88 :type input: str 89 """ 90 91 if input and not isinstance(input, str): 92 try: 93 self.input = input.name 94 except: 95 log.error(f"Input file '{input} in bad format") 96 raise ValueError(f"Input file '{input} in bad format") 97 else: 98 self.input = input 99 100 # Input format 101 if input: 102 input_name, input_extension = os.path.splitext(self.input) 103 self.input_name = input_name 104 self.input_extension = input_extension 105 self.input_format = self.input_extension.replace(".", "")
The function set_input takes a file name as input, extracts the name and extension, and sets
attributes in the class accordingly.
Parameters
- input: the input file path (or a file-like object with a `name` attribute) used by the `set_input` method to set the input-related attributes (name, extension and format).
107 def set_config(self, config: dict) -> None: 108 """ 109 The set_config function takes a config object and assigns it as the configuration object for the 110 class. 111 112 :param config: The `config` parameter in the `set_config` function is a dictionary object that 113 contains configuration settings for the class. When you call the `set_config` function with a 114 dictionary object as the argument, it will set that dictionary as the configuration object for 115 the class 116 :type config: dict 117 """ 118 119 self.config = config
The set_config function takes a config object and assigns it as the configuration object for the class.
Parameters
- config: the `config` parameter in the `set_config` function is a dictionary object that contains configuration settings for the class. When you call the `set_config` function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class.
121 def set_param(self, param: dict) -> None: 122 """ 123 This function sets a parameter object for the class based on the input dictionary. 124 125 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 126 as the `param` attribute of the class instance 127 :type param: dict 128 """ 129 130 self.param = param
This function sets a parameter object for the class based on the input dictionary.
Parameters
- param: the `set_param` method takes a dictionary object as input and sets it as the `param` attribute of the class instance.
132 def init_variables(self) -> None: 133 """ 134 This function initializes the variables that will be used in the rest of the class 135 """ 136 137 self.prefix = "howard" 138 self.table_variants = "variants" 139 self.dataframe = None 140 141 self.comparison_map = { 142 "gt": ">", 143 "gte": ">=", 144 "lt": "<", 145 "lte": "<=", 146 "equals": "=", 147 "contains": "SIMILAR TO", 148 } 149 150 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 151 152 self.code_type_map_to_sql = { 153 "Integer": "INTEGER", 154 "String": "VARCHAR", 155 "Float": "FLOAT", 156 "Flag": "VARCHAR", 157 } 158 159 self.index_additionnal_fields = []
This function initializes the variables that will be used in the rest of the class
161 def get_indexing(self) -> bool: 162 """ 163 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 164 returns False. 165 :return: The value of the indexing parameter. 166 """ 167 168 return self.get_param().get("indexing", False)
It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.
Returns
The value of the indexing parameter.
170 def get_connexion_config(self) -> dict: 171 """ 172 The function `get_connexion_config` returns a dictionary containing the configuration for a 173 connection, including the number of threads and memory limit. 174 :return: a dictionary containing the configuration for the Connexion library. 175 """ 176 177 # config 178 config = self.get_config() 179 180 # Connexion config 181 connexion_config = {} 182 threads = self.get_threads() 183 184 # Threads 185 if threads: 186 connexion_config["threads"] = threads 187 188 # Memory 189 # if config.get("memory", None): 190 # connexion_config["memory_limit"] = config.get("memory") 191 if self.get_memory(): 192 connexion_config["memory_limit"] = self.get_memory() 193 194 # Temporary directory 195 if config.get("tmp", None): 196 connexion_config["temp_directory"] = config.get("tmp") 197 198 # Access 199 if config.get("access", None): 200 access = config.get("access") 201 if access in ["RO"]: 202 access = "READ_ONLY" 203 elif access in ["RW"]: 204 access = "READ_WRITE" 205 connexion_db = self.get_connexion_db() 206 if connexion_db in ":memory:": 207 access = "READ_WRITE" 208 connexion_config["access_mode"] = access 209 210 return connexion_config
The function get_connexion_config returns a dictionary containing the configuration for a
connection, including the number of threads and memory limit.
Returns
a dictionary containing the configuration for the Connexion library.
212 def get_duckdb_settings(self) -> dict: 213 """ 214 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 215 string. 216 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 217 """ 218 219 # config 220 config = self.get_config() 221 222 # duckdb settings 223 duckdb_settings_dict = {} 224 if config.get("duckdb_settings", None): 225 duckdb_settings = config.get("duckdb_settings") 226 duckdb_settings = full_path(duckdb_settings) 227 # duckdb setting is a file 228 if os.path.exists(duckdb_settings): 229 with open(duckdb_settings) as json_file: 230 duckdb_settings_dict = yaml.safe_load(json_file) 231 # duckdb settings is a string 232 else: 233 duckdb_settings_dict = json.loads(duckdb_settings) 234 235 return duckdb_settings_dict
The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a
string.
Returns
The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
237 def set_connexion_db(self) -> str: 238 """ 239 The function `set_connexion_db` returns the appropriate database connection string based on the 240 input format and connection type. 241 :return: the value of the variable `connexion_db`. 242 """ 243 244 # Default connexion db 245 default_connexion_db = ":memory:" 246 247 # Find connexion db 248 if self.get_input_format() in ["db", "duckdb"]: 249 connexion_db = self.get_input() 250 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 251 connexion_db = default_connexion_db 252 elif self.get_connexion_type() in ["tmpfile"]: 253 tmp_name = tempfile.mkdtemp( 254 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 255 ) 256 connexion_db = f"{tmp_name}/tmp.db" 257 elif self.get_connexion_type() != "": 258 connexion_db = self.get_connexion_type() 259 else: 260 connexion_db = default_connexion_db 261 262 # Set connexion db 263 self.connexion_db = connexion_db 264 265 return connexion_db
The function set_connexion_db returns the appropriate database connection string based on the
input format and connection type.
Returns
the value of the variable `connexion_db`.
267 def set_connexion(self, conn) -> None: 268 """ 269 The function `set_connexion` creates a connection to a database, with options for different 270 database formats and settings. 271 272 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 273 database. If a connection is not provided, a new connection to an in-memory database is created. 274 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 275 sqlite 276 """ 277 278 # Connexion db 279 connexion_db = self.set_connexion_db() 280 281 # Connexion config 282 connexion_config = self.get_connexion_config() 283 284 # Connexion format 285 connexion_format = self.get_config().get("connexion_format", "duckdb") 286 # Set connexion format 287 self.connexion_format = connexion_format 288 289 # Connexion 290 if not conn: 291 if connexion_format in ["duckdb"]: 292 conn = duckdb.connect(connexion_db, config=connexion_config) 293 # duckDB settings 294 duckdb_settings = self.get_duckdb_settings() 295 if duckdb_settings: 296 for setting in duckdb_settings: 297 setting_value = duckdb_settings.get(setting) 298 if isinstance(setting_value, str): 299 setting_value = f"'{setting_value}'" 300 conn.execute(f"PRAGMA {setting}={setting_value};") 301 elif connexion_format in ["sqlite"]: 302 conn = sqlite3.connect(connexion_db) 303 304 # Set connexion 305 self.conn = conn 306 307 # Log 308 log.debug(f"connexion_format: {connexion_format}") 309 log.debug(f"connexion_db: {connexion_db}") 310 log.debug(f"connexion config: {connexion_config}") 311 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
The function set_connexion creates a connection to a database, with options for different
database formats and settings.
Parameters
- conn: the `conn` parameter in the `set_connexion` method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
313 def set_output(self, output: str = None) -> None: 314 """ 315 The `set_output` function in Python sets the output file based on the input or a specified key 316 in the config file, extracting the output name, extension, and format. 317 318 :param output: The `output` parameter in the `set_output` method is used to specify the name of 319 the output file. If the config file has an 'output' key, the method sets the output to the value 320 of that key. If no output is provided, it sets the output to `None` 321 :type output: str 322 """ 323 324 if output and not isinstance(output, str): 325 self.output = output.name 326 else: 327 self.output = output 328 329 # Output format 330 if self.output: 331 output_name, output_extension = os.path.splitext(self.output) 332 self.output_name = output_name 333 self.output_extension = output_extension 334 self.output_format = self.output_extension.replace(".", "") 335 else: 336 self.output_name = None 337 self.output_extension = None 338 self.output_format = None
The set_output function in Python sets the output file based on the input or a specified key
in the config file, extracting the output name, extension, and format.
Parameters
- output: the `output` parameter in the `set_output` method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to `None`.
340 def set_header(self) -> None: 341 """ 342 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 343 """ 344 345 input_file = self.get_input() 346 default_header_list = [ 347 "##fileformat=VCFv4.2", 348 "#CHROM POS ID REF ALT QUAL FILTER INFO", 349 ] 350 351 # Full path 352 input_file = full_path(input_file) 353 354 if input_file: 355 356 input_format = self.get_input_format() 357 input_compressed = self.get_input_compressed() 358 config = self.get_config() 359 header_list = default_header_list 360 if input_format in [ 361 "vcf", 362 "hdr", 363 "tsv", 364 "csv", 365 "psv", 366 "parquet", 367 "db", 368 "duckdb", 369 ]: 370 # header provided in param 371 if config.get("header_file", None): 372 with open(config.get("header_file"), "rt") as f: 373 header_list = self.read_vcf_header(f) 374 # within a vcf file format (header within input file itsself) 375 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 376 # within a compressed vcf file format (.vcf.gz) 377 if input_compressed: 378 with bgzf.open(input_file, "rt") as f: 379 header_list = self.read_vcf_header(f) 380 # within an uncompressed vcf file format (.vcf) 381 else: 382 with open(input_file, "rt") as f: 383 header_list = self.read_vcf_header(f) 384 # header provided in default external file .hdr 385 elif os.path.exists((input_file + ".hdr")): 386 with open(input_file + ".hdr", "rt") as f: 387 header_list = self.read_vcf_header(f) 388 else: 389 try: # Try to get header info fields and file columns 390 391 with tempfile.TemporaryDirectory() as tmpdir: 392 393 # Create database 394 db_for_header = Database(database=input_file) 395 396 # Get header columns for infos fields 397 db_header_from_columns = ( 398 db_for_header.get_header_from_columns() 399 ) 400 401 # Get real columns in the file 402 db_header_columns = db_for_header.get_columns() 403 404 # Write header file 405 header_file_tmp = os.path.join(tmpdir, "header") 406 f = open(header_file_tmp, "w") 407 
vcf.Writer(f, db_header_from_columns) 408 f.close() 409 410 # Replace #CHROM line with rel columns 411 header_list = db_for_header.read_header_file( 412 header_file=header_file_tmp 413 ) 414 header_list[-1] = "\t".join(db_header_columns) 415 416 except: 417 418 log.warning( 419 f"No header for file {input_file}. Set as default VCF header" 420 ) 421 header_list = default_header_list 422 423 else: # try for unknown format ? 424 425 log.error(f"Input file format '{input_format}' not available") 426 raise ValueError(f"Input file format '{input_format}' not available") 427 428 if not header_list: 429 header_list = default_header_list 430 431 # header as list 432 self.header_list = header_list 433 434 # header as VCF object 435 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 436 437 else: 438 439 self.header_list = None 440 self.header_vcf = None
It reads the header of a VCF file and stores it as a list of strings and as a VCF object
442 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 443 """ 444 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 445 DataFrame based on the connection format. 446 447 :param query: The `query` parameter in the `get_query_to_df` function is a string that 448 represents the SQL query you want to execute. This query will be used to fetch data from a 449 database and convert it into a pandas DataFrame 450 :type query: str 451 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 452 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 453 function will only fetch up to that number of rows from the database query result. If no limit 454 is specified, 455 :type limit: int 456 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 457 """ 458 459 # Connexion format 460 connexion_format = self.get_connexion_format() 461 462 # Limit in query 463 if limit: 464 pd.set_option("display.max_rows", limit) 465 if connexion_format in ["duckdb"]: 466 df = ( 467 self.conn.execute(query) 468 .fetch_record_batch(limit) 469 .read_next_batch() 470 .to_pandas() 471 ) 472 elif connexion_format in ["sqlite"]: 473 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 474 475 # Full query 476 else: 477 if connexion_format in ["duckdb"]: 478 df = self.conn.execute(query).df() 479 elif connexion_format in ["sqlite"]: 480 df = pd.read_sql_query(query, self.conn) 481 482 return df
The get_query_to_df function takes a query as a string and returns the result as a pandas
DataFrame based on the connection format.
Parameters
- query: the `query` parameter in the `get_query_to_df` function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame.
- limit: the `limit` parameter in the `get_query_to_df` function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified, the full result is returned.
Returns
A pandas DataFrame is being returned by the `get_query_to_df` function.
484 def get_overview(self) -> None: 485 """ 486 The function prints the input, output, config, and dataframe of the current object 487 """ 488 table_variants_from = self.get_table_variants(clause="from") 489 sql_columns = self.get_header_columns_as_sql() 490 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 491 df = self.get_query_to_df(sql_query_export) 492 log.info( 493 "Input: " 494 + str(self.get_input()) 495 + " [" 496 + str(str(self.get_input_format())) 497 + "]" 498 ) 499 log.info( 500 "Output: " 501 + str(self.get_output()) 502 + " [" 503 + str(str(self.get_output_format())) 504 + "]" 505 ) 506 log.info("Config: ") 507 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 508 "\n" 509 ): 510 log.info("\t" + str(d)) 511 log.info("Param: ") 512 for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 513 "\n" 514 ): 515 log.info("\t" + str(d)) 516 log.info("Sample list: " + str(self.get_header_sample_list())) 517 log.info("Dataframe: ") 518 for d in str(df).split("\n"): 519 log.info("\t" + str(d)) 520 521 # garbage collector 522 del df 523 gc.collect() 524 525 return None
The function prints the input, output, config, and dataframe of the current object
527 def get_stats(self) -> dict: 528 """ 529 The `get_stats` function calculates and returns various statistics of the current object, 530 including information about the input file, variants, samples, header fields, quality, and 531 SNVs/InDels. 532 :return: a dictionary containing various statistics of the current object. The dictionary has 533 the following structure: 534 """ 535 536 # Log 537 log.info(f"Stats Calculation...") 538 539 # table varaints 540 table_variants_from = self.get_table_variants() 541 542 # stats dict 543 stats = {"Infos": {}} 544 545 ### File 546 input_file = self.get_input() 547 stats["Infos"]["Input file"] = input_file 548 549 # Header 550 header_infos = self.get_header().infos 551 header_formats = self.get_header().formats 552 header_infos_list = list(header_infos) 553 header_formats_list = list(header_formats) 554 555 ### Variants 556 557 stats["Variants"] = {} 558 559 # Variants by chr 560 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 561 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 562 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 563 by=["CHROM"], kind="quicksort" 564 ) 565 566 # Total number of variants 567 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 568 569 # Calculate percentage 570 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 571 lambda x: (x / nb_of_variants) 572 ) 573 574 stats["Variants"]["Number of variants by chromosome"] = ( 575 nb_of_variants_by_chrom.to_dict(orient="index") 576 ) 577 578 stats["Infos"]["Number of variants"] = int(nb_of_variants) 579 580 ### Samples 581 582 # Init 583 samples = {} 584 nb_of_samples = 0 585 586 # Check Samples 587 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 588 log.debug(f"Check samples...") 589 for sample in self.get_header_sample_list(): 590 sql_query_samples = f""" 591 SELECT 
'{sample}' as sample, 592 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 593 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 594 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 595 FROM {table_variants_from} 596 WHERE ( 597 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 598 AND 599 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 600 ) 601 GROUP BY genotype 602 """ 603 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 604 sample_genotype_count = sql_query_genotype_df["count"].sum() 605 if len(sql_query_genotype_df): 606 nb_of_samples += 1 607 samples[f"{sample} - {sample_genotype_count} variants"] = ( 608 sql_query_genotype_df.to_dict(orient="index") 609 ) 610 611 stats["Samples"] = samples 612 stats["Infos"]["Number of samples"] = nb_of_samples 613 614 # # 615 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 616 # stats["Infos"]["Number of samples"] = nb_of_samples 617 # elif nb_of_samples: 618 # stats["Infos"]["Number of samples"] = "not a VCF format" 619 620 ### INFO and FORMAT fields 621 header_types_df = {} 622 header_types_list = { 623 "List of INFO fields": header_infos, 624 "List of FORMAT fields": header_formats, 625 } 626 i = 0 627 for header_type in header_types_list: 628 629 header_type_infos = header_types_list.get(header_type) 630 header_infos_dict = {} 631 632 for info in header_type_infos: 633 634 i += 1 635 header_infos_dict[i] = {} 636 637 # ID 638 header_infos_dict[i]["id"] = info 639 640 # num 641 genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"} 642 if header_type_infos[info].num in genotype_map.keys(): 643 header_infos_dict[i]["Number"] = genotype_map.get( 644 header_type_infos[info].num 645 ) 646 else: 647 header_infos_dict[i]["Number"] = header_type_infos[info].num 648 649 # type 650 if header_type_infos[info].type: 651 header_infos_dict[i]["Type"] = 
header_type_infos[info].type 652 else: 653 header_infos_dict[i]["Type"] = "." 654 655 # desc 656 if header_type_infos[info].desc != None: 657 header_infos_dict[i]["Description"] = header_type_infos[info].desc 658 else: 659 header_infos_dict[i]["Description"] = "" 660 661 if len(header_infos_dict): 662 header_types_df[header_type] = pd.DataFrame.from_dict( 663 header_infos_dict, orient="index" 664 ).to_dict(orient="index") 665 666 # Stats 667 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 668 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 669 stats["Header"] = header_types_df 670 671 ### QUAL 672 if "QUAL" in self.get_header_columns(): 673 sql_query_qual = f""" 674 SELECT 675 avg(CAST(QUAL AS INTEGER)) AS Average, 676 min(CAST(QUAL AS INTEGER)) AS Minimum, 677 max(CAST(QUAL AS INTEGER)) AS Maximum, 678 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 679 median(CAST(QUAL AS INTEGER)) AS Median, 680 variance(CAST(QUAL AS INTEGER)) AS Variance 681 FROM {table_variants_from} 682 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 683 """ 684 685 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 686 stats["Quality"] = {"Stats": qual} 687 688 ### SNV and InDel 689 690 sql_query_snv = f""" 691 692 SELECT Type, count FROM ( 693 694 SELECT 695 'Total' AS Type, 696 count(*) AS count 697 FROM {table_variants_from} 698 699 UNION 700 701 SELECT 702 'MNV' AS Type, 703 count(*) AS count 704 FROM {table_variants_from} 705 WHERE len(REF) > 1 AND len(ALT) > 1 706 AND len(REF) = len(ALT) 707 708 UNION 709 710 SELECT 711 'InDel' AS Type, 712 count(*) AS count 713 FROM {table_variants_from} 714 WHERE len(REF) > 1 OR len(ALT) > 1 715 AND len(REF) != len(ALT) 716 717 UNION 718 719 SELECT 720 'SNV' AS Type, 721 count(*) AS count 722 FROM {table_variants_from} 723 WHERE len(REF) = 1 AND len(ALT) = 1 724 725 ) 726 727 ORDER BY count DESC 728 729 """ 730 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 731 732 
sql_query_snv_substitution = f""" 733 SELECT 734 concat(REF, '>', ALT) AS 'Substitution', 735 count(*) AS count 736 FROM {table_variants_from} 737 WHERE len(REF) = 1 AND len(ALT) = 1 738 GROUP BY REF, ALT 739 ORDER BY count(*) DESC 740 """ 741 snv_substitution = ( 742 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 743 ) 744 stats["Variants"]["Counts"] = snv_indel 745 stats["Variants"]["Substitutions"] = snv_substitution 746 747 return stats
The get_stats function calculates and returns various statistics of the current object,
including information about the input file, variants, samples, header fields, quality, and
SNVs/InDels.
Returns
a dictionary containing various statistics of the current object. The dictionary has the following structure:
749 def stats_to_file(self, file: str = None) -> str: 750 """ 751 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 752 into a JSON object, and writes the JSON object to the specified file. 753 754 :param file: The `file` parameter is a string that represents the file path where the JSON data 755 will be written 756 :type file: str 757 :return: the name of the file that was written to. 758 """ 759 760 # Get stats 761 stats = self.get_stats() 762 763 # Serializing json 764 json_object = json.dumps(stats, indent=4) 765 766 # Writing to sample.json 767 with open(file, "w") as outfile: 768 outfile.write(json_object) 769 770 return file
The function stats_to_file takes a file name as input, retrieves statistics, serializes them
into a JSON object, and writes the JSON object to the specified file.
Parameters
- file: The `file` parameter is a string that represents the file path where the JSON data will be written.
Returns
the name of the file that was written to.
def print_stats(self, output_file: str = None, json_file: str = None) -> None:
    """
    The `print_stats` function generates a markdown file and prints the statistics contained in a
    JSON file in a formatted manner.

    :param output_file: The `output_file` parameter is a string that specifies the path and filename
        of the output file where the stats will be printed in Markdown format. If no `output_file` is
        provided, a temporary directory will be created and the stats will be saved in a file named
        "stats.md" within that
    :type output_file: str
    :param json_file: The `json_file` parameter is a string that represents the path to the JSON
        file where the statistics will be saved. If no value is provided, a temporary directory will be
        created and a default file name "stats.json" will be used
    :type json_file: str
    :return: The function `print_stats` does not return any value. It has a return type annotation
        of `None`.
    """

    # Full path
    output_file = full_path(output_file)
    json_file = full_path(json_file)

    # NOTE(review): when no paths are given, the files are created inside this
    # temporary directory and are therefore deleted once the block exits — only
    # the printed output survives in that case.
    with tempfile.TemporaryDirectory() as tmpdir:

        # Files
        if not output_file:
            output_file = os.path.join(tmpdir, "stats.md")
        if not json_file:
            json_file = os.path.join(tmpdir, "stats.json")

        # Create folders
        if not os.path.exists(os.path.dirname(output_file)):
            Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
        if not os.path.exists(os.path.dirname(json_file)):
            Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

        # Create stats JSON file
        stats_file = self.stats_to_file(file=json_file)

        # Print stats file (JSON is valid YAML, hence yaml.safe_load)
        with open(stats_file) as f:
            stats = yaml.safe_load(f)

        # Output accumulators: title lines, index (TOC) lines, body lines
        output_title = []
        output_index = []
        output = []

        # Title
        output_title.append("# HOWARD Stats")

        # Index
        output_index.append("## Index")

        # Process sections
        for section in stats:
            infos = stats.get(section)
            # GitHub-style anchor for the TOC link
            section_link = "#" + section.lower().replace(" ", "-")
            output.append(f"## {section}")
            output_index.append(f"- [{section}]({section_link})")

            if len(infos):
                for info in infos:
                    # Try to render the value as a table: first as a dict,
                    # then as a JSON-encoded string; otherwise fall back to
                    # a plain bullet line.
                    # NOTE(review): bare except — any failure (not just a
                    # conversion error) silently falls through to the next case.
                    try:
                        df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                        is_df = True
                    except:
                        try:
                            df = pd.DataFrame.from_dict(
                                json.loads((infos.get(info))), orient="index"
                            )
                            is_df = True
                        except:
                            is_df = False
                    if is_df:
                        output.append(f"### {info}")
                        info_link = "#" + info.lower().replace(" ", "-")
                        output_index.append(f" - [{info}]({info_link})")
                        output.append(f"{df.to_markdown(index=False)}")
                    else:
                        output.append(f"- {info}: {infos.get(info)}")
            else:
                # Empty section
                output.append(f"NA")

        # Write stats in markdown file
        with open(output_file, "w") as fp:
            for item in output_title:
                fp.write("%s\n" % item)
            for item in output_index:
                fp.write("%s\n" % item)
            for item in output:
                fp.write("%s\n" % item)

        # Output stats in markdown on stdout (title and body; the index is
        # only written to the file)
        print("")
        print("\n\n".join(output_title))
        print("")
        print("\n\n".join(output))
        print("")

    return None
The print_stats function generates a markdown file and prints the statistics contained in a
JSON file in a formatted manner.
Parameters
- output_file: The `output_file` parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no `output_file` is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that directory.
- json_file: The `json_file` parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used.
Returns
The function
print_statsdoes not return any value. It has a return type annotation ofNone.
def get_input(self) -> str:
    """Return the path of the current input file."""
    current_input = self.input
    return current_input
It returns the value of the input variable.
Returns
The input is being returned.
def get_input_format(self, input_file: str = None) -> str:
    """
    Return the detected file format of an input file.

    :param input_file: optional path to inspect; when omitted, the object's
        own input file (``self.get_input()``) is used
    :type input_file: str
    :return: the detected format of the file
    """
    target = input_file if input_file else self.get_input()
    return get_file_format(target)
This function returns the format of the input variable, either from the provided input file or by prompting for input.
Parameters
- input_file: The `input_file` parameter in the `get_input_format` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it will default to `None`.
Returns
The format of the input variable is being returned.
def get_input_compressed(self, input_file: str = None) -> str:
    """
    Return the compression status/format of an input file.

    :param input_file: optional path to inspect; when omitted, the object's
        own input file (``self.get_input()``) is used
    :type input_file: str
    :return: the compression information of the file
    """
    target = input_file if input_file else self.get_input()
    return get_file_compressed(target)
The function get_input_compressed returns the format of the input variable after compressing
it.
Parameters
- input_file: The
input_fileparameter in theget_input_compressedmethod is a string that represents the file path of the input file. If noinput_fileis provided when calling the method, it will default toNoneand the method will then callself.get_input()to
Returns
The function
get_input_compressedreturns the compressed format of the input variable.
def get_output(self) -> str:
    """Return the path of the current output file."""
    current_output = self.output
    return current_output
It returns the output of the neuron.
Returns
The output of the neural network.
def get_output_format(self, output_file: str = None) -> str:
    """
    Return the detected file format of an output file.

    :param output_file: optional path to inspect; when omitted, the object's
        own output file (``self.get_output()``) is used
    :type output_file: str
    :return: the detected format of the file
    """
    target = output_file if output_file else self.get_output()
    return get_file_format(target)
The function get_output_format returns the format of the input variable or the output file if
provided.
Parameters
- output_file: The
output_fileparameter in theget_output_formatmethod is a string that represents the file path of the output file. If nooutput_fileis provided when calling the method, it will default to the output obtained from theget_outputmethod of the class instance. The
Returns
The format of the input variable is being returned.
def get_config(self) -> dict:
    """Return the configuration dictionary of the object."""
    configuration = self.config
    return configuration
It returns the config
Returns
The config variable is being returned.
def get_param(self) -> dict:
    """Return the parameters dictionary of the object."""
    parameters = self.param
    return parameters
It returns the param
Returns
The param variable is being returned.
def get_connexion_db(self) -> str:
    """Return the ``connexion_db`` attribute of the object."""
    database = self.connexion_db
    return database
It returns the connexion_db attribute of the object
Returns
The connexion_db is being returned.
def get_prefix(self) -> str:
    """Return the prefix of the object."""
    current_prefix = self.prefix
    return current_prefix
It returns the prefix of the object.
Returns
The prefix is being returned.
def get_table_variants(self, clause: str = "select") -> str:
    """
    Return the variants table expression for use in a SQL clause.

    :param clause: the kind of clause the table will be used in, either
        "select", "where", "update" or "from"; defaults to "select"
    :return: the table name, aliased when used in a "from" clause
    """

    if clause == "from":
        # Read-only access on a parquet input queries the file directly
        read_only = self.get_config().get("access", None) in ["RO"]
        if self.get_input_format() in ["parquet"] and read_only:
            return f"'{self.get_input()}' as variants"
        return f"{self.table_variants} as variants"

    # "select", "where", "update" and any other clause use the bare name
    return self.table_variants
This function returns the table_variants attribute of the object
Parameters
- clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns
The table_variants attribute of the object.
def get_tmp_dir(self) -> str:
    """
    Return the temporary directory path resolved from the object's
    configuration and parameters, with "/tmp" as the default.

    :return: the temporary directory path
    """
    configuration = self.get_config()
    parameters = self.get_param()
    return get_tmp(config=configuration, param=parameters, default_tmp="/tmp")
The function get_tmp_dir returns the temporary directory path based on configuration
parameters or a default path.
Returns
The
get_tmp_dirmethod is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".
def get_connexion_type(self) -> str:
    """
    Return the connexion type from the configuration ("memory" when unset).

    :return: the connexion type
    """
    connexion_type = self.get_config().get("connexion_type", "memory")
    return connexion_type
If the connexion type is not in the list of allowed connexion types, raise a ValueError
Returns
The connexion type is being returned.
def get_connexion(self):
    """Return the database connection object."""
    connection = self.conn
    return connection
It returns the connection object
Returns
The connection object.
def close_connexion(self) -> None:
    """Close the database connection and return the result of ``close()``."""
    connection = self.conn
    return connection.close()
This function closes the connection to the database.
Returns
The connection is being closed.
def get_header(self, type: str = "vcf"):
    """
    Return the VCF header in the requested representation.

    :param type: "vcf" for a vcf.Reader object, "list" for the raw header
        lines; defaults to "vcf"
    :return: the header, or a minimal required header when none is loaded
    """

    if self.header_vcf:
        # A header has already been loaded on the object
        if type == "vcf":
            return self.header_vcf
        if type == "list":
            return self.header_list
    else:
        # Fall back to the minimal required VCF header
        if type == "vcf":
            return vcf.Reader(io.StringIO("\n".join(vcf_required)))
        if type == "list":
            return vcf_required
This function returns the header of the VCF file as a list of strings
Parameters
- type: the type of header you want to get, defaults to vcf (optional)
Returns
The header of the vcf file.
def get_header_length(self, file: str = None) -> int:
    """
    Return the number of header lines, excluding the #CHROM line.

    :param file: optional path to a VCF header file; when given, that file is
        read instead of the header loaded on the object
    :type file: str
    :return: the header length minus the #CHROM line, or 0 when no header
    """

    if file:
        return len(self.read_vcf_header_file(file=file)) - 1

    header_lines = self.get_header(type="list")
    if header_lines:
        return len(header_lines) - 1
    return 0
The function get_header_length returns the length of the header list, excluding the #CHROM
line.
Parameters
- file: The `file` parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line).
Returns
the length of the header list, excluding the #CHROM line.
def get_header_columns(self) -> str:
    """
    Return the #CHROM columns line of the VCF header.

    :return: the last header line (the #CHROM line), or "" when no header
    """
    if not self.get_header():
        return ""
    return self.get_header(type="list")[-1]
This function returns the header list of a VCF
Returns
The length of the header list.
def get_header_columns_as_list(self) -> list:
    """
    Return the #CHROM header columns as a list of column names.

    :return: the column names split on tabs, or [] when no header
    """
    if not self.get_header():
        return []
    columns_line = self.get_header_columns()
    return columns_line.strip().split("\t")
This function returns the header list of a VCF
Returns
The length of the header list.
def get_header_columns_as_sql(self) -> str:
    """
    Return the header columns as a comma-separated list of double-quoted
    SQL identifiers.

    :return: e.g. '"#CHROM","POS",...' (empty string when no columns)
    """
    return ",".join(f'"{col}"' for col in self.get_header_columns_as_list())
This function retruns header length (without #CHROM line)
Returns
The length of the header list.
def get_header_sample_list(self) -> list:
    """Return the list of sample names declared in the VCF header."""
    header = self.header_vcf
    return header.samples
This function retruns header length (without #CHROM line)
Returns
The length of the header list.
def get_verbose(self) -> bool:
    """
    Return the "verbose" flag from the configuration.

    :return: the value of the "verbose" key, or False when the key is absent
    """
    configuration = self.get_config()
    return configuration.get("verbose", False)
It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist
Returns
The value of the key "verbose" in the config dictionary.
def get_connexion_format(self) -> str:
    """
    Return the connexion format of the object.

    :return: "duckdb" or "sqlite"
    :raises ValueError: when the format is neither "duckdb" nor "sqlite"
    """
    connexion_format = self.connexion_format
    if connexion_format in ["duckdb", "sqlite"]:
        return connexion_format
    log.error(f"Unknown connexion format {connexion_format}")
    raise ValueError(f"Unknown connexion format {connexion_format}")
It returns the connexion format of the object.
Returns
The connexion_format is being returned.
def insert_file_to_table(
    self,
    file,
    columns: str,
    header_len: int = 0,
    sep: str = "\t",
    chunksize: int = 1000000,
) -> None:
    """
    Read a delimited file in chunks with pandas and insert each chunk into
    the "variants" table of the connected database (DuckDB or SQLite).

    :param file: the file (path or file object) to load into the table
    :param columns: comma-separated column names used in the DuckDB INSERT
        statement
    :type columns: str
    :param header_len: number of leading lines to skip before the data
        (e.g. the VCF header), defaults to 0
    :type header_len: int (optional)
    :param sep: field separator of the file being read, defaults to "\t"
    :type sep: str (optional)
    :param chunksize: number of rows read per chunk; may be overridden by
        config["load"]["chunk"], defaults to 1000000
    :type chunksize: int (optional)
    """

    # Config: the configured chunk size takes precedence over the argument
    chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
    connexion_format = self.get_connexion_format()

    log.debug("chunksize: " + str(chunksize))

    # NOTE(review): if chunksize resolves to a falsy value (0/None), the file
    # is silently not loaded at all — confirm this is intended.
    if chunksize:
        for chunk in pd.read_csv(
            file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
        ):
            if connexion_format in ["duckdb"]:
                # Relies on DuckDB's Python replacement scan: the local
                # DataFrame variable `chunk` is referenced by name in SQL
                sql_insert_into = (
                    f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                )
                self.conn.execute(sql_insert_into)
            elif connexion_format in ["sqlite"]:
                # pandas handles the SQLite insertion directly
                chunk.to_sql("variants", self.conn, if_exists="append", index=False)
The function reads a file in chunks and inserts each chunk into a table based on the specified database format.
Parameters
- file: The
fileparameter is the file that you want to load into a table. It should be the path to the file on your system - columns: The
columnsparameter in theinsert_file_to_tablefunction is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name - header_len: The
header_lenparameter in theinsert_file_to_tablefunction specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0 - sep: The
- sep: The `sep` parameter in the `insert_file_to_table` function specifies the separator character used in the file being read; it defaults to `\t` (tab). You can change this parameter to a different separator character if needed.
- chunksize: The `chunksize` parameter specifies the number of rows to read at a time when processing the file in chunks; it defaults to 1000000, meaning the file is read in chunks of 1,000,000 rows.
def load_data(
    self,
    input_file: str = None,
    drop_variants_table: bool = False,
    sample_size: int = 20480,
) -> None:
    """
    The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
    table before loading the data and specify a sample size.

    :param input_file: The path to the input file. This is the VCF file that will be loaded into the
        table
    :type input_file: str
    :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
        determines whether the variants table should be dropped before loading the data. If set to
        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
        not be dropped, defaults to False
    :type drop_variants_table: bool (optional)
    :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
        20480
    :type sample_size: int (optional)
    """

    log.info("Loading...")

    # change input file (re-reads the header for the new file)
    if input_file:
        self.set_input(input_file)
        self.set_header()

    # drop variants table
    if drop_variants_table:
        self.drop_variants_table()

    # get table variants
    table_variants = self.get_table_variants()

    # Access mode ("RO" creates a view instead of a table below)
    access = self.get_config().get("access", None)
    log.debug(f"access: {access}")

    # Input format and compress
    input_format = self.get_input_format()
    input_compressed = self.get_input_compressed()
    log.debug(f"input_format: {input_format}")
    log.debug(f"input_compressed: {input_compressed}")

    # input_compressed_format (logged only; not used below)
    if input_compressed:
        input_compressed_format = "gzip"
    else:
        input_compressed_format = "none"
    log.debug(f"input_compressed_format: {input_compressed_format}")

    # Connexion format
    connexion_format = self.get_connexion_format()

    # Sample size (-1 means "no sampling" for Database.get_sql_from)
    if not sample_size:
        sample_size = -1
    log.debug(f"sample_size: {sample_size}")

    # Load data
    log.debug(f"Load Data from {input_format}")

    # DuckDB connexion
    if connexion_format in ["duckdb"]:

        # Database already exists
        if self.input_format in ["db", "duckdb"]:

            # NOTE(review): the else branch here is unreachable — the outer
            # condition already guarantees connexion_format is "duckdb"
            if connexion_format in ["duckdb"]:
                log.debug(f"Input file format '{self.input_format}' duckDB")
            else:
                log.error(
                    f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                )
                raise ValueError(
                    f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                )

        # Load from existing database format
        else:

            try:
                # Create Table or View from the Database abstraction's
                # SQL source expression
                database = Database(database=self.input)
                sql_from = database.get_sql_from(sample_size=sample_size)

                if access in ["RO"]:
                    sql_load = (
                        f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                    )
                else:
                    sql_load = (
                        f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                    )
                self.conn.execute(sql_load)

            except:
                # NOTE(review): bare except maps any failure (even unrelated
                # ones) to "format not available"
                log.error(f"Input file format '{self.input_format}' not available")
                raise ValueError(
                    f"Input file format '{self.input_format}' not available"
                )

    # SQLite connexion
    elif connexion_format in ["sqlite"] and input_format in [
        "vcf",
        "tsv",
        "csv",
        "psv",
    ]:

        # Main structure: mandatory VCF columns and their SQL types
        structure = {
            "#CHROM": "VARCHAR",
            "POS": "INTEGER",
            "ID": "VARCHAR",
            "REF": "VARCHAR",
            "ALT": "VARCHAR",
            "QUAL": "VARCHAR",
            "FILTER": "VARCHAR",
            "INFO": "VARCHAR",
        }

        # Structure with samples
        # NOTE(review): this is an alias, not a copy — adding sample columns
        # below also mutates `structure`; confirm that is intended
        structure_complete = structure
        if self.get_header_sample_list():
            structure["FORMAT"] = "VARCHAR"
            for sample in self.get_header_sample_list():
                structure_complete[sample] = "VARCHAR"

        # Columns list for create and insert
        sql_create_table_columns = []
        sql_create_table_columns_list = []
        for column in structure_complete:
            column_type = structure_complete[column]
            sql_create_table_columns.append(
                f'"{column}" {column_type} default NULL'
            )
            sql_create_table_columns_list.append(f'"{column}"')

        # Create database table
        log.debug(f"Create Table {table_variants}")
        sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
        sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
        sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
        self.conn.execute(sql_create_table)

        # chunksize define length of file chunk load file
        chunksize = 100000

        # delimiter of the input format (tab by default)
        delimiter = file_format_delimiters.get(input_format, "\t")

        # Load the input file
        with open(self.input, "rt") as input_file:

            # Use the appropriate file handler based on the input format
            # NOTE(review): rebinding input_file to bgzf.open() leaves the
            # plain-text handle open until the with-block exits
            if input_compressed:
                input_file = bgzf.open(self.input, "rt")
            if input_format in ["vcf"]:
                header_len = self.get_header_length()
            else:
                header_len = 0

            # Insert the file contents into a table
            self.insert_file_to_table(
                input_file,
                columns=sql_create_table_columns_list_sql,
                header_len=header_len,
                sep=delimiter,
                chunksize=chunksize,
            )

    else:
        log.error(
            f"Connexion format '{connexion_format}' not available with format '{input_format}'"
        )
        raise ValueError(
            f"Connexion format '{connexion_format}' not available with format '{input_format}'"
        )

    # Explode INFOS fields into table fields
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=True,
        )

    # Create index after insertion
    self.create_indexes()
The load_data function reads a VCF file and inserts it into a table, with options to drop the
table before loading the data and specify a sample size.
Parameters
- input_file: The path to the input file. This is the VCF file that will be loaded into the table
- drop_variants_table: The
drop_variants_tableparameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set toTrue, the variants table will be dropped. If set toFalse(default), the variants table will not be dropped, defaults to False - sample_size: The
sample_sizeparameter determines the number of rows to be sampled from the input file. If it is set toNone, the default value of 20480 will be used, defaults to 20480
def get_explode_infos(self) -> bool:
    """
    Return the "explode_infos" flag from the "explode" section of the
    parameters (False when unset).

    :return: the value of the "explode_infos" parameter, or False
    """
    explode_section = self.get_param().get("explode", {})
    return explode_section.get("explode_infos", False)
The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting
to False if it is not set.
Returns
The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.
def get_explode_infos_fields(
    self,
    explode_infos_fields: str = None,
    remove_fields_not_in_header: bool = False,
) -> list:
    """
    Resolve the list of INFO fields to explode.

    Fields may be given explicitly (string or list), may contain the "*"
    wildcard, and may be regex patterns matched against the header's INFO
    fields.

    :param explode_infos_fields: the fields to be exploded — a
        comma-separated string, a list, or None to read the value from
        param["explode"]["explode_infos_fields"] (falling back to "*")
    :type explode_infos_fields: str
    :param remove_fields_not_in_header: when True, fields that are not
        present in the header are excluded from the result, defaults to False
    :type remove_fields_not_in_header: bool (optional)
    :return: the resolved list of field names (may be empty)
    """

    # If no fields, get it in param
    if not explode_infos_fields:
        explode_infos_fields = (
            self.get_param().get("explode", {}).get("explode_infos_fields", None)
        )

    # If no fields, defined as all fields in header using keyword
    if not explode_infos_fields:
        explode_infos_fields = "*"

    # If fields list not empty
    if explode_infos_fields:

        # Input fields list (accept comma-separated string or list)
        if isinstance(explode_infos_fields, str):
            fields_input = explode_infos_fields.split(",")
        elif isinstance(explode_infos_fields, list):
            fields_input = explode_infos_fields
        else:
            fields_input = []

        # Fields list without * keyword (used nowhere below; kept as-is)
        fields_without_all = fields_input.copy()
        if "*".casefold() in (item.casefold() for item in fields_without_all):
            fields_without_all.remove("*")

        # Fields in header (deduplicated, sorted INFO IDs)
        fields_in_header = sorted(list(set(self.get_header().infos)))

        # Construct list of fields
        fields_output = []
        for field in fields_input:

            # Strip field
            field = field.strip()

            # format keyword * in regex (".*" matches every header field)
            if field.upper() in ["*"]:
                field = ".*"

            # Find all fields with pattern (field is treated as a regex)
            r = re.compile(field)
            fields_search = sorted(list(filter(r.match, fields_in_header)))

            # Remove fields input from search: an exact header match wins,
            # otherwise drop pattern matches that were also given explicitly
            if field in fields_search:
                fields_search = [field]
            elif fields_search != [field]:
                fields_search = sorted(
                    list(set(fields_search).difference(fields_input))
                )

            # If field is not in header (avoid not well formatted header)
            if not fields_search and not remove_fields_not_in_header:
                fields_search = [field]

            # Add found fields
            for new_field in fields_search:
                # Add field, if not already exists, and if it is in header (if asked)
                if (
                    new_field not in fields_output
                    and (
                        not remove_fields_not_in_header
                        or new_field in fields_in_header
                    )
                    and new_field not in [".*"]
                ):
                    fields_output.append(new_field)

        return fields_output

    else:

        return []
The get_explode_infos_fields function returns a list of exploded information fields based on
the input parameter explode_infos_fields.
Parameters
- explode_infos_fields: a string (or list) specifying the fields to be exploded. It can be set to "*" to explode all header fields, or it can be a comma-separated list of field names to explode.
- remove_fields_not_in_header: a boolean flag that determines whether to remove fields that are not present in the header. If set to True, any field that is not in the header is excluded from the list of exploded information fields; defaults to False.
Returns
The function
The `get_explode_infos_fields` function returns a list of exploded information fields. If the `explode_infos_fields` parameter is not provided or is None, the value is looked up in the parameters, defaulting to all header fields ("*"). Otherwise, it returns the list of fields after stripping spaces and splitting the string by commas.
1495 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1496 """ 1497 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1498 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1499 not provided. 1500 1501 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1502 prefix to be used for exploding or expanding information 1503 :type explode_infos_prefix: str 1504 :return: the value of the variable `explode_infos_prefix`. 1505 """ 1506 1507 if not explode_infos_prefix: 1508 explode_infos_prefix = ( 1509 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1510 ) 1511 1512 return explode_infos_prefix
The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or
the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is
not provided.
Parameters
- explode_infos_prefix: The parameter
`explode_infos_prefix` is a string that specifies a prefix to be used for exploding or expanding information.
Returns
the value of the variable
explode_infos_prefix.
1514 def add_column( 1515 self, 1516 table_name, 1517 column_name, 1518 column_type, 1519 default_value=None, 1520 drop: bool = False, 1521 ) -> dict: 1522 """ 1523 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1524 doesn't already exist. 1525 1526 :param table_name: The name of the table to which you want to add a column 1527 :param column_name: The parameter "column_name" is the name of the column that you want to add 1528 to the table 1529 :param column_type: The `column_type` parameter specifies the data type of the column that you 1530 want to add to the table. It should be a string that represents the desired data type, such as 1531 "INTEGER", "TEXT", "REAL", etc 1532 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1533 default value for the newly added column. If a default value is provided, it will be assigned to 1534 the column for any existing rows that do not have a value for that column 1535 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1536 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1537 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1538 to False 1539 :type drop: bool (optional) 1540 :return: a boolean value indicating whether the column was successfully added to the table. 
1541 """ 1542 1543 # added 1544 added = False 1545 dropped = False 1546 1547 # Check if the column already exists in the table 1548 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1549 columns = self.get_query_to_df(query).columns.tolist() 1550 if column_name.upper() in [c.upper() for c in columns]: 1551 log.debug( 1552 f"The {column_name} column already exists in the {table_name} table" 1553 ) 1554 if drop: 1555 self.drop_column(table_name=table_name, column_name=column_name) 1556 dropped = True 1557 else: 1558 return None 1559 else: 1560 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1561 1562 # Add column in table 1563 add_column_query = ( 1564 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1565 ) 1566 if default_value is not None: 1567 add_column_query += f" DEFAULT {default_value}" 1568 self.execute_query(add_column_query) 1569 added = not dropped 1570 log.debug( 1571 f"The {column_name} column was successfully added to the {table_name} table" 1572 ) 1573 1574 if added: 1575 added_column = { 1576 "table_name": table_name, 1577 "column_name": column_name, 1578 "column_type": column_type, 1579 "default_value": default_value, 1580 } 1581 else: 1582 added_column = None 1583 1584 return added_column
The add_column function adds a column to a SQLite or DuckDB table with a default value if it
doesn't already exist.
Parameters
- table_name: The name of the table to which you want to add a column
- column_name: The parameter "column_name" is the name of the column that you want to add to the table
- column_type: the SQL data type of the column to add, such as "INTEGER", "TEXT" or "REAL".
- default_value: an optional default value assigned to the newly added column for existing rows that do not have a value for that column.
- drop: a boolean flag that determines whether to drop and re-create the column if it already exists in the table. If False (default), an existing column is left untouched.
Returns
a dict describing the added column, or None if the column already existed (even when it was dropped and re-created).
1586 def drop_column( 1587 self, column: dict = None, table_name: str = None, column_name: str = None 1588 ) -> bool: 1589 """ 1590 The `drop_column` function drops a specified column from a given table in a database and returns 1591 True if the column was successfully dropped, and False if the column does not exist in the 1592 table. 1593 1594 :param column: The `column` parameter is a dictionary that contains information about the column 1595 you want to drop. It has two keys: 1596 :type column: dict 1597 :param table_name: The `table_name` parameter is the name of the table from which you want to 1598 drop a column 1599 :type table_name: str 1600 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1601 from the table 1602 :type column_name: str 1603 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1604 and False if the column does not exist in the table. 1605 """ 1606 1607 # Find column infos 1608 if column: 1609 if isinstance(column, dict): 1610 table_name = column.get("table_name", None) 1611 column_name = column.get("column_name", None) 1612 elif isinstance(column, str): 1613 table_name = self.get_table_variants() 1614 column_name = column 1615 else: 1616 table_name = None 1617 column_name = None 1618 1619 if not table_name and not column_name: 1620 return False 1621 1622 # Removed 1623 removed = False 1624 1625 # Check if the column already exists in the table 1626 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1627 columns = self.get_query_to_df(query).columns.tolist() 1628 if column_name in columns: 1629 log.debug(f"The {column_name} column exists in the {table_name} table") 1630 else: 1631 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1632 return False 1633 1634 # Add column in table # ALTER TABLE integers DROP k 1635 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1636 
self.execute_query(add_column_query) 1637 removed = True 1638 log.debug( 1639 f"The {column_name} column was successfully dropped to the {table_name} table" 1640 ) 1641 1642 return removed
The drop_column function drops a specified column from a given table in a database and returns
True if the column was successfully dropped, and False if the column does not exist in the
table.
Parameters
- column: either a dict with keys "table_name" and "column_name", or a string column name (the variants table is then used).
- table_name: the name of the table from which you want to drop a column.
- column_name: the name of the column that you want to drop from the table.
Returns
a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into individual table columns.

        For each selected INFO field, a dedicated column (named
        ``<prefix><field>``) is added to the variants table and populated by
        parsing the raw INFO string with engine-specific SQL (regex for
        DuckDB, instr/substr for SQLite). Skipped entirely when the connexion
        access mode is read-only ("RO").

        :param prefix: prefix for the exploded columns; when None/invalid,
            falls back to `self.get_explode_infos_prefix()`, then "INFO/"
        :type prefix: str
        :param create_index: when True, (re)create indexes after exploding,
            defaults to False
        :type create_index: bool (optional)
        :param fields: list of INFO fields (names or patterns) to explode;
            when empty, all header fields are used
        :type fields: list
        :param force: when True, drop and re-create columns that already
            exist, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: when True, run a single UPDATE
            covering all fields; otherwise one UPDATE per field.
            (NOTE(review): parameter name keeps the historical "proccess"
            typo for interface compatibility), defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: target table name; defaults to the variants table
        :type table: str
        :return: list of added columns (dicts as returned by `add_column`)
        """

        # Drop indexes first: they would slow down / block the UPDATEs below
        self.drop_indexes()

        # connexion format (drives the SQL dialect used to parse INFO)
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: fall back to configured prefix, then "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (columns present in the table but not in the
            # header); best-effort — failures fall back to an empty list
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check (header fields plus explicitly requested ones)
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (expands "*" and regexes)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                # Only explode fields known from the header, the request,
                # or already present as extra columns
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Fields absent from the header are treated as String scalars
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Map VCF type to SQL type; multi-valued fields (num != 1)
                    # are stored as VARCHAR since they hold joined values
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # add_column returns None for a dropped/re-created column,
                    # hence the extra `force` check
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the engine-specific SET clause extracting
                        # "<info>=<value>" from the raw INFO string
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                    WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                    ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Process chromosome by chromosome to keep UPDATEs smaller;
                # fall back to a single pass when the query fails
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only useful with more than one chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
The explode_infos function in Python takes a VCF file and explodes the INFO fields into
individual columns, returning a list of added columns.
Parameters
- prefix: a string used as a prefix for the exploded INFO fields. If not provided or None, the value of `self.get_explode_infos_prefix()` is used (falling back to "INFO/").
- create_index: a boolean flag that specifies whether to create indexes on the exploded INFO fields; defaults to False.
- fields: a list of INFO fields to explode into individual columns. If not provided, all INFO fields are exploded.
- force: a boolean flag that determines whether to drop and re-create a column if it already exists in the table; defaults to False.
- proccess_all_fields_together: a boolean flag that determines whether to process all the INFO fields in a single UPDATE statement (True) or one field at a time (False); defaults to False.
- table: the name of the table where the exploded INFO fields are added as individual columns; defaults to the variants table.
Returns
The
The `explode_infos` function returns a list of added columns.
1861 def create_indexes(self) -> None: 1862 """ 1863 Create indexes on the table after insertion 1864 """ 1865 1866 # Access 1867 access = self.get_config().get("access", None) 1868 1869 # get table variants 1870 table_variants = self.get_table_variants("FROM") 1871 1872 if self.get_indexing() and access not in ["RO"]: 1873 # Create index 1874 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 1875 self.conn.execute(sql_create_table_index) 1876 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 1877 self.conn.execute(sql_create_table_index) 1878 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 1879 self.conn.execute(sql_create_table_index) 1880 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 1881 self.conn.execute(sql_create_table_index) 1882 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 1883 self.conn.execute(sql_create_table_index) 1884 for field in self.index_additionnal_fields: 1885 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 1886 self.conn.execute(sql_create_table_index)
Create indexes on the table after insertion
1888 def drop_indexes(self) -> None: 1889 """ 1890 Create indexes on the table after insertion 1891 """ 1892 1893 # Access 1894 access = self.get_config().get("access", None) 1895 1896 # get table variants 1897 table_variants = self.get_table_variants("FROM") 1898 1899 # Get database format 1900 connexion_format = self.get_connexion_format() 1901 1902 if access not in ["RO"]: 1903 if connexion_format in ["duckdb"]: 1904 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 1905 elif connexion_format in ["sqlite"]: 1906 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 1907 1908 list_indexes = self.conn.execute(sql_list_indexes) 1909 index_names = [row[0] for row in list_indexes.fetchall()] 1910 for index in index_names: 1911 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 1912 self.conn.execute(sql_drop_table_index)
Drop all indexes of the variants table.
1914 def read_vcf_header(self, f) -> list: 1915 """ 1916 It reads the header of a VCF file and returns a list of the header lines 1917 1918 :param f: the file object 1919 :return: The header lines of the VCF file. 1920 """ 1921 1922 header_list = [] 1923 for line in f: 1924 header_list.append(line) 1925 if line.startswith("#CHROM"): 1926 break 1927 return header_list
It reads the header of a VCF file and returns a list of the header lines
Parameters
- f: the file object
Returns
The header lines of the VCF file.
1929 def read_vcf_header_file(self, file: str = None) -> list: 1930 """ 1931 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 1932 uncompressed files. 1933 1934 :param file: The `file` parameter is a string that represents the path to the VCF header file 1935 that you want to read. It is an optional parameter, so if you don't provide a value, it will 1936 default to `None` 1937 :type file: str 1938 :return: The function `read_vcf_header_file` returns a list. 1939 """ 1940 1941 if self.get_input_compressed(input_file=file): 1942 with bgzf.open(file, "rt") as f: 1943 return self.read_vcf_header(f=f) 1944 else: 1945 with open(file, "rt") as f: 1946 return self.read_vcf_header(f=f)
The read_vcf_header_file function reads the header of a VCF file, handling both compressed and
uncompressed files.
Parameters
- file: The
The `file` parameter is a string that represents the path to the VCF header file to read. It is optional and defaults to None.
Returns
The function
The function `read_vcf_header_file` returns a list of header lines.
1948 def execute_query(self, query: str): 1949 """ 1950 It takes a query as an argument, executes it, and returns the results 1951 1952 :param query: The query to be executed 1953 :return: The result of the query is being returned. 1954 """ 1955 if query: 1956 return self.conn.execute(query) # .fetchall() 1957 else: 1958 return None
It takes a query as an argument, executes it, and returns the results
Parameters
- query: The query to be executed
Returns
The result of the query is being returned.
1960 def export_output( 1961 self, 1962 output_file: str | None = None, 1963 output_header: str | None = None, 1964 export_header: bool = True, 1965 query: str | None = None, 1966 parquet_partitions: list | None = None, 1967 chunk_size: int | None = None, 1968 threads: int | None = None, 1969 sort: bool = False, 1970 index: bool = False, 1971 order_by: str | None = None, 1972 ) -> bool: 1973 """ 1974 The `export_output` function exports data from a VCF file to a specified output file in various 1975 formats, including VCF, CSV, TSV, PSV, and Parquet. 1976 1977 :param output_file: The `output_file` parameter is a string that specifies the name of the 1978 output file to be generated by the function. This is where the exported data will be saved 1979 :type output_file: str 1980 :param output_header: The `output_header` parameter is a string that specifies the name of the 1981 file where the header of the VCF file will be exported. If this parameter is not provided, the 1982 header will be exported to a file with the same name as the `output_file` parameter, but with 1983 the extension " 1984 :type output_header: str 1985 :param export_header: The `export_header` parameter is a boolean flag that determines whether 1986 the header of a VCF file should be exported to a separate file or not. If `export_header` is 1987 True, the header will be exported to a file. If `export_header` is False, the header will not 1988 be, defaults to True, if output format is not VCF 1989 :type export_header: bool (optional) 1990 :param query: The `query` parameter is an optional SQL query that can be used to filter and 1991 select specific data from the VCF file before exporting it. If provided, only the data that 1992 matches the query will be exported 1993 :type query: str 1994 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 1995 columns to be used for partitioning the Parquet file during export. 
Partitioning is a way to 1996 organize data in a hierarchical directory structure based on the values of one or more columns. 1997 This can improve query performance when working with large datasets 1998 :type parquet_partitions: list 1999 :param chunk_size: The `chunk_size` parameter specifies the number of 2000 records in batch when exporting data in Parquet format. This parameter is used for 2001 partitioning the Parquet file into multiple files. 2002 :type chunk_size: int 2003 :param threads: The `threads` parameter is an optional parameter that specifies the number of 2004 threads to be used during the export process. It determines the level of parallelism and can 2005 improve the performance of the export operation. If not provided, the function will use the 2006 default number of threads 2007 :type threads: int 2008 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 2009 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 2010 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 2011 False 2012 :type sort: bool (optional) 2013 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2014 created on the output file. If `index` is True, an index will be created. If `index` is False, 2015 no index will be created. The default value is False, defaults to False 2016 :type index: bool (optional) 2017 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2018 sorting the output file. This parameter is only applicable when exporting data in VCF format 2019 :type order_by: str 2020 :return: a boolean value. It checks if the output file exists and returns True if it does, or 2021 None if it doesn't. 
2022 """ 2023 2024 # Log 2025 log.info("Exporting...") 2026 2027 # Full path 2028 output_file = full_path(output_file) 2029 output_header = full_path(output_header) 2030 2031 # Config 2032 config = self.get_config() 2033 2034 # Param 2035 param = self.get_param() 2036 2037 # Tmp files to remove 2038 tmp_to_remove = [] 2039 2040 # If no output, get it 2041 if not output_file: 2042 output_file = self.get_output() 2043 2044 # If not threads 2045 if not threads: 2046 threads = self.get_threads() 2047 2048 # Auto header name with extension 2049 if export_header or output_header: 2050 if not output_header: 2051 output_header = f"{output_file}.hdr" 2052 # Export header 2053 self.export_header(output_file=output_file) 2054 2055 # Switch off export header if VCF output 2056 output_file_type = get_file_format(output_file) 2057 if output_file_type in ["vcf"]: 2058 export_header = False 2059 tmp_to_remove.append(output_header) 2060 2061 # Chunk size 2062 if not chunk_size: 2063 chunk_size = config.get("chunk_size", None) 2064 2065 # Parquet partition 2066 if not parquet_partitions: 2067 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2068 if parquet_partitions and isinstance(parquet_partitions, str): 2069 parquet_partitions = parquet_partitions.split(",") 2070 2071 # Order by 2072 if not order_by: 2073 order_by = param.get("export", {}).get("order_by", "") 2074 2075 # Header in output 2076 header_in_output = param.get("export", {}).get("include_header", False) 2077 2078 # Database 2079 database_source = self.get_connexion() 2080 2081 # Connexion format 2082 connexion_format = self.get_connexion_format() 2083 2084 # Explode infos 2085 if self.get_explode_infos(): 2086 self.explode_infos( 2087 prefix=self.get_explode_infos_prefix(), 2088 fields=self.get_explode_infos_fields(), 2089 force=False, 2090 ) 2091 2092 # if connexion_format in ["sqlite"] or query: 2093 if connexion_format in ["sqlite"]: 2094 2095 # Export in Parquet 2096 random_tmp = 
"".join( 2097 random.choice(string.ascii_lowercase) for i in range(10) 2098 ) 2099 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2100 tmp_to_remove.append(database_source) 2101 2102 # Table Variants 2103 table_variants = self.get_table_variants() 2104 2105 # Create export query 2106 sql_query_export_subquery = f""" 2107 SELECT * FROM {table_variants} 2108 """ 2109 2110 # Write source file 2111 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2112 2113 # Create database 2114 database = Database( 2115 database=database_source, 2116 table="variants", 2117 header_file=output_header, 2118 conn_config=self.get_connexion_config(), 2119 ) 2120 2121 # Existing colomns header 2122 # existing_columns_header = database.get_header_file_columns(output_header) 2123 existing_columns_header = database.get_header_columns_from_database() 2124 2125 # Export file 2126 database.export( 2127 output_database=output_file, 2128 output_header=output_header, 2129 existing_columns_header=existing_columns_header, 2130 parquet_partitions=parquet_partitions, 2131 chunk_size=chunk_size, 2132 threads=threads, 2133 sort=sort, 2134 index=index, 2135 header_in_output=header_in_output, 2136 order_by=order_by, 2137 query=query, 2138 export_header=export_header, 2139 ) 2140 2141 # Remove 2142 remove_if_exists(tmp_to_remove) 2143 2144 return (os.path.exists(output_file) or None) and ( 2145 os.path.exists(output_file) or None 2146 )
The export_output function exports data from a VCF file to a specified output file in various
formats, including VCF, CSV, TSV, PSV, and Parquet.
Parameters
- output_file: the path of the output file to generate; this is where the exported data is saved.
- output_header: the path of the file where the VCF header is exported. If not provided, the header is exported to "<output_file>.hdr".
- export_header: a boolean flag that determines whether the VCF header is exported to a separate file; defaults to True (ignored for VCF output).
- query: an optional SQL query used to filter and select the data to export; only matching data is exported.
- parquet_partitions: a list of columns used to partition the Parquet output. Partitioning organizes data in a hierarchical directory structure based on column values, which can improve query performance on large datasets.
- chunk_size: the number of records per batch when exporting data in Parquet format.
- threads: the number of threads used during the export; defaults to the instance setting.
- sort: a boolean flag that determines whether the output file is sorted by genomic coordinates; defaults to False.
- index: a boolean flag that determines whether an index is created on the output file; defaults to False.
- order_by: the column(s) used to sort the output file; only applicable when exporting in VCF format.
Returns
a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.
2148 def get_extra_infos(self, table: str = None) -> list: 2149 """ 2150 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2151 in the header. 2152 2153 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2154 name of the table from which you want to retrieve the extra columns that are not present in the 2155 header. If the `table` parameter is not provided when calling the function, it will default to 2156 using the variants 2157 :type table: str 2158 :return: A list of columns that are in the specified table but not in the header of the table. 2159 """ 2160 2161 header_columns = [] 2162 2163 if not table: 2164 table = self.get_table_variants(clause="from") 2165 header_columns = self.get_header_columns() 2166 2167 # Check all columns in the database 2168 query = f""" SELECT * FROM {table} LIMIT 1 """ 2169 log.debug(f"query {query}") 2170 table_columns = self.get_query_to_df(query).columns.tolist() 2171 extra_columns = [] 2172 2173 # Construct extra infos (not in header) 2174 for column in table_columns: 2175 if column not in header_columns: 2176 extra_columns.append(column) 2177 2178 return extra_columns
The get_extra_infos function returns a list of columns that are in a specified table but not
in the header.
Parameters
- table: The
`table` parameter in the `get_extra_infos` function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the `table` parameter is not provided when calling the function, it will default to using the variants table.
Returns
A list of columns that are in the specified table but not in the header of the table.
2180 def get_extra_infos_sql(self, table: str = None) -> str: 2181 """ 2182 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2183 by double quotes 2184 2185 :param table: The name of the table to get the extra infos from. If None, the default table is 2186 used 2187 :type table: str 2188 :return: A string of the extra infos 2189 """ 2190 2191 return ", ".join( 2192 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2193 )
It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes
Parameters
- table: The name of the table to get the extra infos from. If None, the default table is used
Returns
A string of the extra infos
2195 def export_header( 2196 self, 2197 header_name: str = None, 2198 output_file: str = None, 2199 output_file_ext: str = ".hdr", 2200 clean_header: bool = True, 2201 remove_chrom_line: bool = False, 2202 ) -> str: 2203 """ 2204 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2205 specified options, and writes it to a new file. 2206 2207 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2208 this parameter is not specified, the header will be written to the output file 2209 :type header_name: str 2210 :param output_file: The `output_file` parameter in the `export_header` function is used to 2211 specify the name of the output file where the header will be written. If this parameter is not 2212 provided, the header will be written to a temporary file 2213 :type output_file: str 2214 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2215 string that represents the extension of the output header file. By default, it is set to ".hdr" 2216 if not specified by the user. This extension will be appended to the `output_file` name to 2217 create the final, defaults to .hdr 2218 :type output_file_ext: str (optional) 2219 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2220 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2221 `True`, the function will clean the header by modifying certain lines based on a specific 2222 pattern. If `clean_header`, defaults to True 2223 :type clean_header: bool (optional) 2224 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2225 boolean flag that determines whether the #CHROM line should be removed from the header before 2226 writing it to the output file. 
If set to `True`, the #CHROM line will be removed; if set to `, 2227 defaults to False 2228 :type remove_chrom_line: bool (optional) 2229 :return: The function `export_header` returns the name of the temporary header file that is 2230 created. 2231 """ 2232 2233 if not header_name and not output_file: 2234 output_file = self.get_output() 2235 2236 if self.get_header(): 2237 2238 # Get header object 2239 header_obj = self.get_header() 2240 2241 # Create database 2242 db_for_header = Database(database=self.get_input()) 2243 2244 # Get real columns in the file 2245 db_header_columns = db_for_header.get_columns() 2246 2247 with tempfile.TemporaryDirectory() as tmpdir: 2248 2249 # Write header file 2250 header_file_tmp = os.path.join(tmpdir, "header") 2251 f = open(header_file_tmp, "w") 2252 vcf.Writer(f, header_obj) 2253 f.close() 2254 2255 # Replace #CHROM line with rel columns 2256 header_list = db_for_header.read_header_file( 2257 header_file=header_file_tmp 2258 ) 2259 header_list[-1] = "\t".join(db_header_columns) 2260 2261 # Remove CHROM line 2262 if remove_chrom_line: 2263 header_list.pop() 2264 2265 # Clean header 2266 if clean_header: 2267 header_list_clean = [] 2268 for head in header_list: 2269 # Clean head for malformed header 2270 head_clean = head 2271 head_clean = re.subn( 2272 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2273 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2274 head_clean, 2275 2, 2276 )[0] 2277 # Write header 2278 header_list_clean.append(head_clean) 2279 header_list = header_list_clean 2280 2281 tmp_header_name = output_file + output_file_ext 2282 2283 f = open(tmp_header_name, "w") 2284 for line in header_list: 2285 f.write(line) 2286 f.close() 2287 2288 return tmp_header_name
The export_header function takes a VCF file, extracts the header, modifies it according to
specified options, and writes it to a new file.
Parameters
- header_name: The
header_nameparameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file - output_file: The
output_fileparameter in theexport_headerfunction is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file - output_file_ext: The
output_file_extparameter in theexport_headerfunction is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to theoutput_filename to create the final, defaults to .hdr - clean_header: The
clean_headerparameter in theexport_headerfunction is a boolean flag that determines whether the header should be cleaned or not. Whenclean_headeris set toTrue, the function will clean the header by modifying certain lines based on a specific pattern. Ifclean_header, defaults to True - remove_chrom_line: The
remove_chrom_lineparameter in theexport_headerfunction is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set toTrue, the #CHROM line will be removed; if set to `, defaults to False
Returns
The function
export_headerreturns the name of the temporary header file that is created.
2290 def export_variant_vcf( 2291 self, 2292 vcf_file, 2293 remove_info: bool = False, 2294 add_samples: bool = True, 2295 list_samples: list = [], 2296 where_clause: str = "", 2297 index: bool = False, 2298 threads: int | None = None, 2299 ) -> bool | None: 2300 """ 2301 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2302 remove INFO field, add samples, and control compression and indexing. 2303 2304 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2305 written to. It is the output file that will contain the filtered VCF data based on the specified 2306 parameters 2307 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2308 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2309 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2310 in, defaults to False 2311 :type remove_info: bool (optional) 2312 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2313 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2314 If set to False, the samples will be removed. The default value is True, defaults to True 2315 :type add_samples: bool (optional) 2316 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2317 in the output VCF file. By default, all samples will be included. If you provide a list of 2318 samples, only those samples will be included in the output file 2319 :type list_samples: list 2320 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2321 determines whether or not to create an index for the output VCF file. If `index` is set to 2322 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2323 :type index: bool (optional) 2324 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2325 number of threads to use for exporting the VCF file. It determines how many parallel threads 2326 will be used during the export process. More threads can potentially speed up the export process 2327 by utilizing multiple cores of the processor. If 2328 :type threads: int | None 2329 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2330 method with various parameters including the output file, query, threads, sort flag, and index 2331 flag. The `export_output` method is responsible for exporting the VCF data based on the 2332 specified parameters and configurations provided in the `export_variant_vcf` function. 2333 """ 2334 2335 # Config 2336 config = self.get_config() 2337 2338 # Extract VCF 2339 log.debug("Export VCF...") 2340 2341 # Table variants 2342 table_variants = self.get_table_variants() 2343 2344 # Threads 2345 if not threads: 2346 threads = self.get_threads() 2347 2348 # Info fields 2349 if remove_info: 2350 if not isinstance(remove_info, str): 2351 remove_info = "." 
2352 info_field = f"""'{remove_info}' as INFO""" 2353 else: 2354 info_field = "INFO" 2355 2356 # Samples fields 2357 if add_samples: 2358 if not list_samples: 2359 list_samples = self.get_header_sample_list() 2360 if list_samples: 2361 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2362 else: 2363 samples_fields = "" 2364 log.debug(f"samples_fields: {samples_fields}") 2365 else: 2366 samples_fields = "" 2367 2368 # Where clause 2369 if where_clause is None: 2370 where_clause = "" 2371 2372 # Variants 2373 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2374 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2375 log.debug(f"sql_query_select={sql_query_select}") 2376 2377 return self.export_output( 2378 output_file=vcf_file, 2379 output_header=None, 2380 export_header=True, 2381 query=sql_query_select, 2382 parquet_partitions=None, 2383 chunk_size=config.get("chunk_size", None), 2384 threads=threads, 2385 sort=True, 2386 index=index, 2387 order_by=None, 2388 )
The export_variant_vcf function exports a VCF file with specified samples, allowing options to
remove INFO field, add samples, and control compression and indexing.
Parameters
- vcf_file: The
vcf_fileparameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters - remove_info: The
remove_infoparameter in theexport_variant_vcffunction is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set toTrue, the INFO field will be removed. If set toFalse, the INFO field will be included in, defaults to False - add_samples: The
add_samplesparameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True - list_samples: The
list_samplesparameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file - index: The
indexparameter in theexport_variant_vcffunction is a boolean flag that determines whether or not to create an index for the output VCF file. Ifindexis set toTrue, the output VCF file will be indexed using tabix. Ifindex, defaults to False - threads: The
threadsparameter in theexport_variant_vcffunction specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If
Returns
The
`export_variant_vcf` function returns the result of calling the `export_output` method with various parameters including the output file, query, threads, sort flag, and index flag. The `export_output` method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the `export_variant_vcf` function.
2390 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2391 """ 2392 It takes a list of commands and runs them in parallel using the number of threads specified 2393 2394 :param commands: A list of commands to run 2395 :param threads: The number of threads to use, defaults to 1 (optional) 2396 """ 2397 2398 run_parallel_commands(commands, threads)
It takes a list of commands and runs them in parallel using the number of threads specified
Parameters
- commands: A list of commands to run
- threads: The number of threads to use, defaults to 1 (optional)
2400 def get_threads(self, default: int = 1) -> int: 2401 """ 2402 This function returns the number of threads to use for a job, with a default value of 1 if not 2403 specified. 2404 2405 :param default: The `default` parameter in the `get_threads` method is used to specify the 2406 default number of threads to use if no specific value is provided. If no value is provided for 2407 the `threads` parameter in the configuration or input parameters, the `default` value will be 2408 used, defaults to 1 2409 :type default: int (optional) 2410 :return: the number of threads to use for the current job. 2411 """ 2412 2413 # Config 2414 config = self.get_config() 2415 2416 # Param 2417 param = self.get_param() 2418 2419 # Input threads 2420 input_thread = param.get("threads", config.get("threads", None)) 2421 2422 # Check threads 2423 if not input_thread: 2424 threads = default 2425 elif int(input_thread) <= 0: 2426 threads = os.cpu_count() 2427 else: 2428 threads = int(input_thread) 2429 return threads
This function returns the number of threads to use for a job, with a default value of 1 if not specified.
Parameters
- default: The
defaultparameter in theget_threadsmethod is used to specify the default number of threads to use if no specific value is provided. If no value is provided for thethreadsparameter in the configuration or input parameters, thedefaultvalue will be used, defaults to 1
Returns
the number of threads to use for the current job.
2431 def get_memory(self, default: str = None) -> str: 2432 """ 2433 This function retrieves the memory value from parameters or configuration with a default value 2434 if not found. 2435 2436 :param default: The `get_memory` function takes in a default value as a string parameter. This 2437 default value is used as a fallback in case the `memory` parameter is not provided in the 2438 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2439 the function 2440 :type default: str 2441 :return: The `get_memory` function returns a string value representing the memory parameter. If 2442 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2443 return the default value provided as an argument to the function. 2444 """ 2445 2446 # Config 2447 config = self.get_config() 2448 2449 # Param 2450 param = self.get_param() 2451 2452 # Input threads 2453 input_memory = param.get("memory", config.get("memory", None)) 2454 2455 # Check threads 2456 if input_memory: 2457 memory = input_memory 2458 else: 2459 memory = default 2460 2461 return memory
This function retrieves the memory value from parameters or configuration with a default value if not found.
Parameters
- default: The
get_memoryfunction takes in a default value as a string parameter. This default value is used as a fallback in case thememoryparameter is not provided in theparamdictionary or theconfigdictionary. Ifmemoryis not found in either dictionary, the function
Returns
The
get_memoryfunction returns a string value representing the memory parameter. If theinput_memoryis provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.
2463 def update_from_vcf(self, vcf_file: str) -> None: 2464 """ 2465 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2466 2467 :param vcf_file: the path to the VCF file 2468 """ 2469 2470 connexion_format = self.get_connexion_format() 2471 2472 if connexion_format in ["duckdb"]: 2473 self.update_from_vcf_duckdb(vcf_file) 2474 elif connexion_format in ["sqlite"]: 2475 self.update_from_vcf_sqlite(vcf_file)
If the database is duckdb, then use the parquet method, otherwise use the sqlite method
Parameters
- vcf_file: the path to the VCF file
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column
        of a VCF file, using duckdb.

        Rows are matched on #CHROM/POS/REF/ALT; when both sides carry a
        non-empty INFO value, the VCF INFO is appended after a ';'
        separator, otherwise the non-empty one is kept.

        :param vcf_file: the path to the VCF file
        """

        # Variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the header lines.
        # NOTE: vcf_df looks unused, but it is referenced by name inside the
        # SQL below — duckdb resolves 'vcf_df' through its DataFrame
        # replacement scan of local variables.
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Correlated UPDATE: concatenate matching VCF INFO onto existing INFO
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
        ;
        """
        self.conn.execute(sql_query_update)
It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file
Parameters
- vcf_file: the path to the VCF file
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column
        of a VCF file, using sqlite.

        Loads the VCF into a temporary table, runs a correlated UPDATE
        matching rows on #CHROM/POS/REF/ALT (appending the VCF INFO after a
        ';' when both INFO values are non-empty), then drops the temporary
        table.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same schema as 'variants'
        # (WHERE 0 copies the structure without any rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body into the temporary table
        # (comment='#' skips all header lines, including the #CHROM line,
        # so the column names are assigned explicitly below)
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: sqlite has no concat() function — CONCAT as || operator
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
                        CASE
                            WHEN table_vcf.INFO NOT IN ('','.')
                            THEN table_vcf.INFO
                            ELSE ''
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                        AND table_vcf.\"POS\" = table_variants.\"POS\"
                        AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                        AND table_vcf.\"REF\" = table_variants.\"REF\"
                    )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table
Parameters
- vcf_file: The path to the VCF file you want to update the database with
2591 def drop_variants_table(self) -> None: 2592 """ 2593 > This function drops the variants table 2594 """ 2595 2596 table_variants = self.get_table_variants() 2597 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2598 self.conn.execute(sql_table_variants)
This function drops the variants table
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a `variant_id` column to the variants table and populate it
        with a hash of the assembly, `#CHROM`, `POS`, `REF`, and `ALT`
        columns.

        :param variant_id_column: The name of the column to be created in
            the variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be (re)created
            even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly (param takes precedence over config, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (removed again at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column (normalize empty/None to the default name)
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column only when missing (or when forced)
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash argument '"{prefix}SVTYPE"' is a
            # SQL *string literal* (single-quoted), not a reference to the
            # exploded SVTYPE column — confirm whether the column value was
            # intended here instead.
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove columns added by explode_infos
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
It adds a column to the variants table called variant_id and populates it with a hash of the
#CHROM, POS, REF, and ALT columns
Parameters
- variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
- force: If True, the variant_id column will be created even if it already exists
Returns
The name of the column that contains the variant_id
2659 def get_variant_id_column( 2660 self, variant_id_column: str = "variant_id", force: bool = None 2661 ) -> str: 2662 """ 2663 This function returns the variant_id column name 2664 2665 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2666 defaults to variant_id 2667 :type variant_id_column: str (optional) 2668 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2669 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2670 if it is not already set, or if it is set 2671 :type force: bool 2672 :return: The variant_id column name. 2673 """ 2674 2675 return self.set_variant_id(variant_id_column=variant_id_column, force=force)
This function returns the variant_id column name
Parameters
- variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
- force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns
The variant_id column name.
2681 def scan_databases( 2682 self, 2683 database_formats: list = ["parquet"], 2684 database_releases: list = ["current"], 2685 ) -> dict: 2686 """ 2687 The function `scan_databases` scans for available databases based on specified formats and 2688 releases. 2689 2690 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2691 of the databases to be scanned. In this case, the accepted format is "parquet" 2692 :type database_formats: list ["parquet"] 2693 :param database_releases: The `database_releases` parameter is a list that specifies the 2694 releases of the databases to be scanned. In the provided function, the default value for 2695 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2696 databases that are in the "current" 2697 :type database_releases: list 2698 :return: The function `scan_databases` returns a dictionary containing information about 2699 databases that match the specified formats and releases. 2700 """ 2701 2702 # Config 2703 config = self.get_config() 2704 2705 # Param 2706 param = self.get_param() 2707 2708 # Param - Assembly 2709 assembly = param.get("assembly", config.get("assembly", None)) 2710 if not assembly: 2711 assembly = DEFAULT_ASSEMBLY 2712 log.warning(f"Default assembly '{assembly}'") 2713 2714 # Scan for availabled databases 2715 log.info( 2716 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2717 ) 2718 databases_infos_dict = databases_infos( 2719 database_folder_releases=database_releases, 2720 database_formats=database_formats, 2721 assembly=assembly, 2722 config=config, 2723 ) 2724 log.info( 2725 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2726 ) 2727 2728 return databases_infos_dict
The function scan_databases scans for available databases based on specified formats and
releases.
Parameters
- database_formats: The
database_formatsparameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet" - database_releases: The
database_releasesparameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value fordatabase_releasesis set to["current"], meaning that by default, the function will scan databases that are in the "current"
Returns
The function
scan_databasesreturns a dictionary containing information about databases that match the specified formats and releases.
2730 def annotation(self) -> None: 2731 """ 2732 It annotates the VCF file with the annotations specified in the config file. 2733 """ 2734 2735 # Config 2736 config = self.get_config() 2737 2738 # Param 2739 param = self.get_param() 2740 2741 # Param - Assembly 2742 assembly = param.get("assembly", config.get("assembly", None)) 2743 if not assembly: 2744 assembly = DEFAULT_ASSEMBLY 2745 log.warning(f"Default assembly '{assembly}'") 2746 2747 # annotations databases folders 2748 annotations_databases = set( 2749 config.get("folders", {}) 2750 .get("databases", {}) 2751 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2752 + config.get("folders", {}) 2753 .get("databases", {}) 2754 .get("parquet", ["~/howard/databases/parquet/current"]) 2755 + config.get("folders", {}) 2756 .get("databases", {}) 2757 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2758 ) 2759 2760 # Get param annotations 2761 if param.get("annotations", None) and isinstance( 2762 param.get("annotations", None), str 2763 ): 2764 log.debug(param.get("annotations", None)) 2765 param_annotation_list = param.get("annotations").split(",") 2766 else: 2767 param_annotation_list = [] 2768 2769 # Each tools param 2770 if param.get("annotation_parquet", None) != None: 2771 log.debug( 2772 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2773 ) 2774 if isinstance(param.get("annotation_parquet", None), list): 2775 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2776 else: 2777 param_annotation_list.append(param.get("annotation_parquet")) 2778 if param.get("annotation_snpsift", None) != None: 2779 if isinstance(param.get("annotation_snpsift", None), list): 2780 param_annotation_list.append( 2781 "snpsift:" 2782 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2783 ) 2784 else: 2785 param_annotation_list.append( 2786 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2787 ) 2788 if param.get("annotation_snpeff", None) 
!= None: 2789 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2790 if param.get("annotation_bcftools", None) != None: 2791 if isinstance(param.get("annotation_bcftools", None), list): 2792 param_annotation_list.append( 2793 "bcftools:" 2794 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2795 ) 2796 else: 2797 param_annotation_list.append( 2798 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2799 ) 2800 if param.get("annotation_annovar", None) != None: 2801 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2802 if param.get("annotation_exomiser", None) != None: 2803 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2804 if param.get("annotation_splice", None) != None: 2805 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2806 2807 # Merge param annotations list 2808 param["annotations"] = ",".join(param_annotation_list) 2809 2810 # debug 2811 log.debug(f"param_annotations={param['annotations']}") 2812 2813 if param.get("annotations"): 2814 2815 # Log 2816 # log.info("Annotations - Check annotation parameters") 2817 2818 if not "annotation" in param: 2819 param["annotation"] = {} 2820 2821 # List of annotations parameters 2822 annotations_list_input = {} 2823 if isinstance(param.get("annotations", None), str): 2824 annotation_file_list = [ 2825 value for value in param.get("annotations", "").split(",") 2826 ] 2827 for annotation_file in annotation_file_list: 2828 annotations_list_input[annotation_file] = {"INFO": None} 2829 else: 2830 annotations_list_input = param.get("annotations", {}) 2831 2832 log.info(f"Quick Annotations:") 2833 for annotation_key in list(annotations_list_input.keys()): 2834 log.info(f" {annotation_key}") 2835 2836 # List of annotations and associated fields 2837 annotations_list = {} 2838 2839 for annotation_file in annotations_list_input: 2840 2841 # Explode annotations if ALL 2842 if ( 2843 
annotation_file.upper() == "ALL" 2844 or annotation_file.upper().startswith("ALL:") 2845 ): 2846 2847 # check ALL parameters (formats, releases) 2848 annotation_file_split = annotation_file.split(":") 2849 database_formats = "parquet" 2850 database_releases = "current" 2851 for annotation_file_option in annotation_file_split[1:]: 2852 database_all_options_split = annotation_file_option.split("=") 2853 if database_all_options_split[0] == "format": 2854 database_formats = database_all_options_split[1].split("+") 2855 if database_all_options_split[0] == "release": 2856 database_releases = database_all_options_split[1].split("+") 2857 2858 # Scan for availabled databases 2859 databases_infos_dict = self.scan_databases( 2860 database_formats=database_formats, 2861 database_releases=database_releases, 2862 ) 2863 2864 # Add found databases in annotation parameters 2865 for database_infos in databases_infos_dict.keys(): 2866 annotations_list[database_infos] = {"INFO": None} 2867 2868 else: 2869 annotations_list[annotation_file] = annotations_list_input[ 2870 annotation_file 2871 ] 2872 2873 # Check each databases 2874 if len(annotations_list): 2875 2876 log.info( 2877 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
2878 ) 2879 2880 for annotation_file in annotations_list: 2881 2882 # Init 2883 annotations = annotations_list.get(annotation_file, None) 2884 2885 # Annotation snpEff 2886 if annotation_file.startswith("snpeff"): 2887 2888 log.debug(f"Quick Annotation snpEff") 2889 2890 if "snpeff" not in param["annotation"]: 2891 param["annotation"]["snpeff"] = {} 2892 2893 if "options" not in param["annotation"]["snpeff"]: 2894 param["annotation"]["snpeff"]["options"] = "" 2895 2896 # snpEff options in annotations 2897 param["annotation"]["snpeff"]["options"] = "".join( 2898 annotation_file.split(":")[1:] 2899 ) 2900 2901 # Annotation Annovar 2902 elif annotation_file.startswith("annovar"): 2903 2904 log.debug(f"Quick Annotation Annovar") 2905 2906 if "annovar" not in param["annotation"]: 2907 param["annotation"]["annovar"] = {} 2908 2909 if "annotations" not in param["annotation"]["annovar"]: 2910 param["annotation"]["annovar"]["annotations"] = {} 2911 2912 # Options 2913 annotation_file_split = annotation_file.split(":") 2914 for annotation_file_annotation in annotation_file_split[1:]: 2915 if annotation_file_annotation: 2916 param["annotation"]["annovar"]["annotations"][ 2917 annotation_file_annotation 2918 ] = annotations 2919 2920 # Annotation Exomiser 2921 elif annotation_file.startswith("exomiser"): 2922 2923 log.debug(f"Quick Annotation Exomiser") 2924 2925 param["annotation"]["exomiser"] = params_string_to_dict( 2926 annotation_file 2927 ) 2928 2929 # Annotation Splice 2930 elif annotation_file.startswith("splice"): 2931 2932 log.debug(f"Quick Annotation Splice") 2933 2934 param["annotation"]["splice"] = params_string_to_dict( 2935 annotation_file 2936 ) 2937 2938 # Annotation Parquet or BCFTOOLS 2939 else: 2940 2941 # Tools detection 2942 if annotation_file.startswith("bcftools:"): 2943 annotation_tool_initial = "bcftools" 2944 annotation_file = ":".join(annotation_file.split(":")[1:]) 2945 elif annotation_file.startswith("snpsift:"): 2946 annotation_tool_initial = 
"snpsift" 2947 annotation_file = ":".join(annotation_file.split(":")[1:]) 2948 else: 2949 annotation_tool_initial = None 2950 2951 # list of files 2952 annotation_file_list = annotation_file.replace("+", ":").split( 2953 ":" 2954 ) 2955 2956 for annotation_file in annotation_file_list: 2957 2958 if annotation_file: 2959 2960 # Annotation tool initial 2961 annotation_tool = annotation_tool_initial 2962 2963 # Find file 2964 annotation_file_found = None 2965 2966 # Expand user 2967 annotation_file = full_path(annotation_file) 2968 2969 if os.path.exists(annotation_file): 2970 annotation_file_found = annotation_file 2971 2972 else: 2973 # Find within assembly folders 2974 for annotations_database in annotations_databases: 2975 found_files = find_all( 2976 annotation_file, 2977 os.path.join( 2978 annotations_database, assembly 2979 ), 2980 ) 2981 if len(found_files) > 0: 2982 annotation_file_found = found_files[0] 2983 break 2984 if not annotation_file_found and not assembly: 2985 # Find within folders 2986 for ( 2987 annotations_database 2988 ) in annotations_databases: 2989 found_files = find_all( 2990 annotation_file, annotations_database 2991 ) 2992 if len(found_files) > 0: 2993 annotation_file_found = found_files[0] 2994 break 2995 log.debug( 2996 f"for {annotation_file} annotation_file_found={annotation_file_found}" 2997 ) 2998 2999 # Full path 3000 annotation_file_found = full_path(annotation_file_found) 3001 3002 if annotation_file_found: 3003 3004 database = Database(database=annotation_file_found) 3005 quick_annotation_format = database.get_format() 3006 quick_annotation_is_compressed = ( 3007 database.is_compressed() 3008 ) 3009 quick_annotation_is_indexed = os.path.exists( 3010 f"{annotation_file_found}.tbi" 3011 ) 3012 bcftools_preference = False 3013 3014 # Check Annotation Tool 3015 if not annotation_tool: 3016 if ( 3017 bcftools_preference 3018 and quick_annotation_format 3019 in ["vcf", "bed"] 3020 and quick_annotation_is_compressed 3021 and 
quick_annotation_is_indexed 3022 ): 3023 annotation_tool = "bcftools" 3024 elif quick_annotation_format in [ 3025 "vcf", 3026 "bed", 3027 "tsv", 3028 "tsv", 3029 "csv", 3030 "json", 3031 "tbl", 3032 "parquet", 3033 "duckdb", 3034 ]: 3035 annotation_tool = "parquet" 3036 else: 3037 log.error( 3038 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3039 ) 3040 raise ValueError( 3041 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3042 ) 3043 3044 log.debug( 3045 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3046 ) 3047 3048 # Annotation Tool dispatch 3049 if annotation_tool: 3050 if annotation_tool not in param["annotation"]: 3051 param["annotation"][annotation_tool] = {} 3052 if ( 3053 "annotations" 3054 not in param["annotation"][annotation_tool] 3055 ): 3056 param["annotation"][annotation_tool][ 3057 "annotations" 3058 ] = {} 3059 param["annotation"][annotation_tool][ 3060 "annotations" 3061 ][annotation_file_found] = annotations 3062 3063 else: 3064 log.error( 3065 f"Quick Annotation File {annotation_file} does NOT exist" 3066 ) 3067 3068 self.set_param(param) 3069 3070 if param.get("annotation", None): 3071 log.info("Annotations") 3072 if param.get("annotation", {}).get("parquet", None): 3073 log.info("Annotations 'parquet'...") 3074 self.annotation_parquet() 3075 if param.get("annotation", {}).get("bcftools", None): 3076 log.info("Annotations 'bcftools'...") 3077 self.annotation_bcftools() 3078 if param.get("annotation", {}).get("snpsift", None): 3079 log.info("Annotations 'snpsift'...") 3080 self.annotation_snpsift() 3081 if param.get("annotation", {}).get("annovar", None): 3082 log.info("Annotations 'annovar'...") 3083 self.annotation_annovar() 3084 if param.get("annotation", {}).get("snpeff", None): 3085 log.info("Annotations 'snpeff'...") 3086 self.annotation_snpeff() 3087 if param.get("annotation", {}).get("exomiser", 
None) is not None: 3088 log.info("Annotations 'exomiser'...") 3089 self.annotation_exomiser() 3090 if param.get("annotation", {}).get("splice", None) is not None: 3091 log.info("Annotations 'splice' ...") 3092 self.annotation_splice() 3093 3094 # Explode INFOS fields into table fields 3095 if self.get_explode_infos(): 3096 self.explode_infos( 3097 prefix=self.get_explode_infos_prefix(), 3098 fields=self.get_explode_infos_fields(), 3099 force=True, 3100 )
It annotates the VCF file with the annotations specified in the config file.
3102 def annotation_snpsift(self, threads: int = None) -> None: 3103 """ 3104 This function annotate with bcftools 3105 3106 :param threads: Number of threads to use 3107 :return: the value of the variable "return_value". 3108 """ 3109 3110 # DEBUG 3111 log.debug("Start annotation with bcftools databases") 3112 3113 # Threads 3114 if not threads: 3115 threads = self.get_threads() 3116 log.debug("Threads: " + str(threads)) 3117 3118 # Config 3119 config = self.get_config() 3120 log.debug("Config: " + str(config)) 3121 3122 # Config - snpSift 3123 snpsift_bin_command = get_bin_command( 3124 bin="SnpSift.jar", 3125 tool="snpsift", 3126 bin_type="jar", 3127 config=config, 3128 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3129 ) 3130 if not snpsift_bin_command: 3131 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3132 log.error(msg_err) 3133 raise ValueError(msg_err) 3134 3135 # Config - bcftools 3136 bcftools_bin_command = get_bin_command( 3137 bin="bcftools", 3138 tool="bcftools", 3139 bin_type="bin", 3140 config=config, 3141 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3142 ) 3143 if not bcftools_bin_command: 3144 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3145 log.error(msg_err) 3146 raise ValueError(msg_err) 3147 3148 # Config - BCFTools databases folders 3149 databases_folders = set( 3150 self.get_config() 3151 .get("folders", {}) 3152 .get("databases", {}) 3153 .get("annotations", ["."]) 3154 + self.get_config() 3155 .get("folders", {}) 3156 .get("databases", {}) 3157 .get("bcftools", ["."]) 3158 ) 3159 log.debug("Databases annotations: " + str(databases_folders)) 3160 3161 # Param 3162 annotations = ( 3163 self.get_param() 3164 .get("annotation", {}) 3165 .get("snpsift", {}) 3166 .get("annotations", None) 3167 ) 3168 log.debug("Annotations: " + str(annotations)) 3169 3170 # Assembly 3171 assembly = self.get_param().get( 3172 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3173 ) 3174 
3175 # Data 3176 table_variants = self.get_table_variants() 3177 3178 # Check if not empty 3179 log.debug("Check if not empty") 3180 sql_query_chromosomes = ( 3181 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3182 ) 3183 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3184 if not sql_query_chromosomes_df["count"][0]: 3185 log.info(f"VCF empty") 3186 return 3187 3188 # VCF header 3189 vcf_reader = self.get_header() 3190 log.debug("Initial header: " + str(vcf_reader.infos)) 3191 3192 # Existing annotations 3193 for vcf_annotation in self.get_header().infos: 3194 3195 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3196 log.debug( 3197 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3198 ) 3199 3200 if annotations: 3201 3202 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3203 3204 # Export VCF file 3205 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3206 3207 # Init 3208 commands = {} 3209 3210 for annotation in annotations: 3211 annotation_fields = annotations[annotation] 3212 3213 # Annotation Name 3214 annotation_name = os.path.basename(annotation) 3215 3216 if not annotation_fields: 3217 annotation_fields = {"INFO": None} 3218 3219 log.debug(f"Annotation '{annotation_name}'") 3220 log.debug( 3221 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3222 ) 3223 3224 # Create Database 3225 database = Database( 3226 database=annotation, 3227 databases_folders=databases_folders, 3228 assembly=assembly, 3229 ) 3230 3231 # Find files 3232 db_file = database.get_database() 3233 db_file = full_path(db_file) 3234 db_hdr_file = database.get_header_file() 3235 db_hdr_file = full_path(db_hdr_file) 3236 db_file_type = database.get_format() 3237 db_tbi_file = f"{db_file}.tbi" 3238 db_file_compressed = database.is_compressed() 3239 3240 # Check if compressed 3241 if not db_file_compressed: 3242 log.error( 3243 f"Annotation '{annotation}' - {db_file} NOT 
compressed file" 3244 ) 3245 raise ValueError( 3246 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3247 ) 3248 3249 # Check if indexed 3250 if not os.path.exists(db_tbi_file): 3251 log.error( 3252 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3253 ) 3254 raise ValueError( 3255 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3256 ) 3257 3258 # Check index - try to create if not exists 3259 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3260 log.error("Annotation failed: database not valid") 3261 log.error(f"Annotation annotation file: {db_file}") 3262 log.error(f"Annotation annotation header: {db_hdr_file}") 3263 log.error(f"Annotation annotation index: {db_tbi_file}") 3264 raise ValueError( 3265 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3266 ) 3267 else: 3268 3269 log.debug( 3270 f"Annotation '{annotation}' - file: " 3271 + str(db_file) 3272 + " and " 3273 + str(db_hdr_file) 3274 ) 3275 3276 # Load header as VCF object 3277 db_hdr_vcf = Variants(input=db_hdr_file) 3278 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3279 log.debug( 3280 "Annotation database header: " 3281 + str(db_hdr_vcf_header_infos) 3282 ) 3283 3284 # For all fields in database 3285 annotation_fields_full = False 3286 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3287 annotation_fields = { 3288 key: key for key in db_hdr_vcf_header_infos 3289 } 3290 log.debug( 3291 "Annotation database header - All annotations added: " 3292 + str(annotation_fields) 3293 ) 3294 annotation_fields_full = True 3295 3296 # # Create file for field rename 3297 # log.debug("Create file for field rename") 3298 # tmp_rename = NamedTemporaryFile( 3299 # prefix=self.get_prefix(), 3300 # dir=self.get_tmp_dir(), 3301 # suffix=".rename", 3302 # delete=False, 3303 # ) 3304 # tmp_rename_name = tmp_rename.name 
3305 # tmp_files.append(tmp_rename_name) 3306 3307 # Number of fields 3308 nb_annotation_field = 0 3309 annotation_list = [] 3310 annotation_infos_rename_list = [] 3311 3312 for annotation_field in annotation_fields: 3313 3314 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3315 annotation_fields_new_name = annotation_fields.get( 3316 annotation_field, annotation_field 3317 ) 3318 if not annotation_fields_new_name: 3319 annotation_fields_new_name = annotation_field 3320 3321 # Check if field is in DB and if field is not elready in input data 3322 if ( 3323 annotation_field in db_hdr_vcf.get_header().infos 3324 and annotation_fields_new_name 3325 not in self.get_header().infos 3326 ): 3327 3328 log.info( 3329 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3330 ) 3331 3332 # BCFTools annotate param to rename fields 3333 if annotation_field != annotation_fields_new_name: 3334 annotation_infos_rename_list.append( 3335 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3336 ) 3337 3338 # Add INFO field to header 3339 db_hdr_vcf_header_infos_number = ( 3340 db_hdr_vcf_header_infos[annotation_field].num or "." 
3341 ) 3342 db_hdr_vcf_header_infos_type = ( 3343 db_hdr_vcf_header_infos[annotation_field].type 3344 or "String" 3345 ) 3346 db_hdr_vcf_header_infos_description = ( 3347 db_hdr_vcf_header_infos[annotation_field].desc 3348 or f"{annotation_field} description" 3349 ) 3350 db_hdr_vcf_header_infos_source = ( 3351 db_hdr_vcf_header_infos[annotation_field].source 3352 or "unknown" 3353 ) 3354 db_hdr_vcf_header_infos_version = ( 3355 db_hdr_vcf_header_infos[annotation_field].version 3356 or "unknown" 3357 ) 3358 3359 vcf_reader.infos[annotation_fields_new_name] = ( 3360 vcf.parser._Info( 3361 annotation_fields_new_name, 3362 db_hdr_vcf_header_infos_number, 3363 db_hdr_vcf_header_infos_type, 3364 db_hdr_vcf_header_infos_description, 3365 db_hdr_vcf_header_infos_source, 3366 db_hdr_vcf_header_infos_version, 3367 self.code_type_map[ 3368 db_hdr_vcf_header_infos_type 3369 ], 3370 ) 3371 ) 3372 3373 annotation_list.append(annotation_field) 3374 3375 nb_annotation_field += 1 3376 3377 else: 3378 3379 if ( 3380 annotation_field 3381 not in db_hdr_vcf.get_header().infos 3382 ): 3383 log.warning( 3384 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3385 ) 3386 if ( 3387 annotation_fields_new_name 3388 in self.get_header().infos 3389 ): 3390 log.warning( 3391 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3392 ) 3393 3394 log.info( 3395 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3396 ) 3397 3398 annotation_infos = ",".join(annotation_list) 3399 3400 if annotation_infos != "": 3401 3402 # Annotated VCF (and error file) 3403 tmp_annotation_vcf_name = os.path.join( 3404 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3405 ) 3406 tmp_annotation_vcf_name_err = ( 3407 tmp_annotation_vcf_name + ".err" 3408 ) 3409 3410 # Add fields to annotate 3411 if not annotation_fields_full: 3412 annotation_infos_option = f"-info {annotation_infos}" 3413 else: 
3414 annotation_infos_option = "" 3415 3416 # Info fields rename 3417 if annotation_infos_rename_list: 3418 annotation_infos_rename = " -c " + ",".join( 3419 annotation_infos_rename_list 3420 ) 3421 else: 3422 annotation_infos_rename = "" 3423 3424 # Annotate command 3425 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3426 3427 # Add command 3428 commands[command_annotate] = tmp_annotation_vcf_name 3429 3430 if commands: 3431 3432 # Export VCF file 3433 self.export_variant_vcf( 3434 vcf_file=tmp_vcf_name, 3435 remove_info=True, 3436 add_samples=False, 3437 index=True, 3438 ) 3439 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3440 3441 # Num command 3442 nb_command = 0 3443 3444 # Annotate 3445 for command_annotate in commands: 3446 nb_command += 1 3447 log.info( 3448 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3449 ) 3450 log.debug(f"command_annotate={command_annotate}") 3451 run_parallel_commands([command_annotate], threads) 3452 3453 # Debug 3454 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3455 3456 # Update variants 3457 log.info( 3458 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3459 ) 3460 self.update_from_vcf(commands[command_annotate])
This function annotates with SnpSift.
Parameters
- threads: Number of threads to use
Returns
None; the variants table is updated in place.
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate variants with bcftools databases.

        For each configured database (``param["annotation"]["bcftools"]["annotations"]``),
        the variants are exported to a temporary VCF, annotated per chromosome
        with ``bcftools annotate`` restricted to windowed regions around the
        variants, then all per-chromosome results are merged back into the
        variants table with ``bcftools merge``.

        :param threads: Number of threads to use (defaults to ``self.get_threads()``)
        :return: None; the variants table is updated in place
        :raises ValueError: if the bcftools binary is not found, if a database
            file is missing, not compressed or not indexed, or if any annotate
            command reported an error
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG - keep temporary files around when verbosity is "debug"
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders (annotations + bcftools folders)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to do on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF - temporary input file for bcftools
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []  # per-chromosome annotated VCFs to merge
            commands = []  # bcftools annotate shell commands
            tmp_files = []  # temporary files to remove after merge
            err_files = []  # stderr capture files to scan for [W::]/[E::]

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools annotate requires bgzip)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (tabix .tbi required)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check index - try to create if not exists
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database ("ALL"/"INFO" expands to every
                    # field of the database header)
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not elready in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header (fill missing attributes
                            # with safe defaults)
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools "-c" column spec: rename with ":=INFO/"
                            # when the target name differs from the DB field
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command: keep only "##" meta lines from the header file
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chomosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file (BED has no INFO
                        # column layout: prepend positional columns)
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detecte regions: +/- 1Mb windows around each
                            # variant, clamped at 0, then merged
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: annotate regions, bgzip (level 1) output,
                            # then tabix-index the result
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file (INFO removed: annotations come from DBs)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands (split the thread
                # budget across the parallel bcftools processes)
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

            # Merge all per-chromosome annotated VCFs back with the input
            tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

            if tmp_ann_vcf_list_cmd:

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=True,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)

                # Tmp file remove command (chained onto the merge command)
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                # Command merge
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                log.info(
                    f"Annotation - Annotation merging "
                    + str(len(commands))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Error messages: scan the captured stderr files for htslib
                # warning ([W::]) and error ([E::]) markers
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f" {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f" {message}")
                # failed: any [E::] line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

                # Update variants
                log.info(f"Annotation - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)
This function annotates variants using bcftools.

Parameters:
- threads: the number of threads to use.

Returns:
- None; the variants table is updated in place.
    def annotation_exomiser(self, threads: int = None) -> bool:
        """
        Annotate variants with Exomiser.

        Parameters are read from `param`, section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO).
            Default: None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                {
                    "id": "ISDBM322017",
                    "sex": "FEMALE"
                }
            Default: None
        - "sample" (string):
            Sample name used to construct the "subject" section:
                "subject":
                {
                    "id": "<sample>",
                    "sex": "UNKNOWN_SEX"
                }
            Default: None
        - "phenotypicFeatures" (dict):
            Phenotypic features used to construct the "subject" section.
            Example:
                "phenotypicFeatures":
                [
                    { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                    { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                ]
        - "hpo" (list):
            List of HPO ids as phenotypic features.
            Example: "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "output_options" =
                {
                    "outputContributingVariantsOnly": False,
                    "numGenes": 0,
                    "outputFormats": ["TSV_VARIANT", "VCF"]
                }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl").
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add Exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If it does not exist, the database release will be downloaded (takes a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample is given in parameters, the first sample in the VCF will be chosen
        - If no HPO is found, the "hiPhivePrioritiser" analysis step will be switched off

        :param threads: The number of threads to use
        :return: True if annotation was performed, False if the VCF is empty or has no samples
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        # NOTE(review): a missing folder is only logged here; the download step
        # below is expected to create it — confirm against databases_download_exomiser
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser binary (jar) command
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples: Exomiser needs at least one genotyped sample
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser databases (if not already present)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation (always re-annotate, even if an "Exomiser" INFO field exists)
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # tmp_dir = "/tmp/exomiser"

                ### ANALYSIS ###
                ################

                # Create analysis.json through an analysis dict,
                # either from the "analysis" param or by default
                # (depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # Analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis json
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict (either yaml or json;
                        # yaml.safe_load parses both since JSON is a YAML subset)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Use the dict directly as the analysis dict
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict:
                # use preset (exome/genome) to open the default config file
                if not param_exomiser_analysis_dict:

                    # Default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Try to find if preset is a file
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided as a full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided as a basename in the config folder
                        # (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file name from the preset keyword
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict could be created at all
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load phenopacket json
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (either yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            # Use the dict directly
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If still no PhenoPacket in analysis dict -> construct from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists in param
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject does not exist -> find a sample ID
                    if not param_exomiser_subject:

                        # Sample ID from param
                        sample = param_exomiser.get("sample", None)

                        # Fallback: first sample of the VCF header
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject with unknown sex
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists in param
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If phenotypicFeatures does not exist -> try to infer from hpo list
                    if not param_exomiser_phenotypicfeatures:

                        # HPO in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if given as a comma-separated string
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list; keep only digits so 'HP:0001156' and
                        # '0001156' both normalize to the same id
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # If no phenotypicFeatures -> remove the hiPhivePrioritiser step,
                    # which requires phenotype input to be meaningful
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict, create a minimal one
                # (ISO-ish timestamp; phenopacket schema version 1)
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # Default output formats
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> use defaults
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Force output directory/file name into the tmp results folder
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output directory, and make sure the formats this
                    # method parses (TSV_VARIANT, VCF) are always produced
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### SPLIT analysis and sample config files

                # Split analysis dict (shallow copy: phenopacket is popped below)
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITIAL VCF file ###
                ########################

                ### Create list of samples to use and include into initial VCF file ###

                # Subject (main sample): get sample ID from the analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample IDs within Pedigree
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample IDs in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concat subject sample ID and pedigree sample IDs (deduplicated)
                samples = list(set(sample + pedigree_persons))

                # Check that the sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Create VCF restricted to the selected samples
                # (either samples from param or first one by default)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                # NOTE(review): 'exomiser_command' is assigned but never used below — confirm before removing
                exomiser_command = ""

                # Command exomiser options
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # Phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # Data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # Variant white list (only if the file exists for this release/assembly)
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # Transcript source
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )  # ucsc, refseq, ensembl
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contains a proband param -> use split analysis + sample files
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually a single sample) -> use the full analysis file
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command; a non-zero exit status is fatal
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Whether to explode TSV columns into INFO fields
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types (LIMIT 0 -> header/dtypes only)
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Positional/identity fields that must not become INFO annotations
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enabled
                        if header_column not in fields_to_avoid:

                            # Header info type: object dtype that is fully numeric
                            # -> Float; non-object dtype -> Integer; otherwise String
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info: sanitize column name ('-' -> '_', drop '#')
                            # to produce a valid VCF INFO field name
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field to the list of concat fields for the update query
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )

                                    ELSE ''
                                END
                                """
                            )

                    # Update query: append the generated key=value pairs to INFO,
                    # joining TSV rows to variants on CHROM/POS/REF/ALT
                    # (TSV CONTIG lacks the 'chr' prefix, hence the concat)
                    sql_query_update = f"""
                        UPDATE {table_variants} as table_variants
                        SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN table_variants.INFO NOT IN ('','.')
                                THEN ';'
                                ELSE ''
                            END,
                            (
                                SELECT
                                    concat(
                                        {",".join(sql_query_update_concat_fields)}
                                    )
                                FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                                WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                    AND table_parquet.\"START\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                            )
                        )
                        ;
                        """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

        return True
This function annotates variants using Exomiser.

It uses args as parameters, in section "annotation" -> "exomiser", with sections:
- "analysis" (dict/file): full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO). Default: None
- "preset" (string): analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
- "phenopacket" (dict/file): samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
- "subject" (dict): sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" }. Default: None
- "sample" (string): sample name used to construct the "subject" section: "subject": { "id": "<sample>", "sex": "UNKNOWN_SEX" }. Default: None
- "phenotypicFeatures" (dict): phenotypic features used to construct the "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
- "hpo" (list): list of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055']. Default: []
- "outputOptions" (dict): output options (see Exomiser docs). Default: { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
- "transcript_source" (string): transcript source (either "refseq", "ucsc", "ensembl"). Default: "refseq"
- "exomiser_to_info" (boolean): add Exomiser TSV file columns as INFO fields in VCF. Default: False
- "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by the application.properties configuration file)
- "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:
- If no sample is given in parameters, the first sample in the VCF will be chosen.
- If no HPO is found, the "hiPhivePrioritiser" analysis step will be switched off.

Parameters:
- threads: the number of threads to use.

Returns:
- None.
    def annotation_snpeff(self, threads: int = None) -> None:
        """
        Annotate variants with snpEff.

        Exports the variants to a temporary VCF, runs the snpEff jar on it,
        merges the new INFO header fields and updates the variants table
        from the annotated VCF.

        :param threads: The number of threads to use
        :return: None; returns early if the variants table is empty
        """

        # DEBUG
        log.debug("Start annotation with snpeff databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Keep tmp files/folders only in debug verbosity
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - snpEff bin command (java + jar)
        snpeff_bin_command = get_bin_command(
            bin="snpEff.jar",
            tool="snpeff",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpeff_bin_command:
            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - snpEff databases folder (created if missing)
        snpeff_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
        )
        snpeff_databases = full_path(snpeff_databases)
        if snpeff_databases is not None and snpeff_databases != "":
            log.debug(f"Create snpEff databases folder")
            if not os.path.exists(snpeff_databases):
                os.makedirs(snpeff_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - snpEff options
        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
        log.debug("Options: " + str(options))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Param - Options: "OUTPUT" placeholder in stats paths is replaced
        # by the actual output file path
        snpeff_options = (
            param.get("annotation", {}).get("snpeff", {}).get("options", "")
        )
        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
        snpeff_csvstats = (
            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
        )
        if snpeff_stats:
            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
            snpeff_stats = full_path(snpeff_stats)
            snpeff_options += f" -stats {snpeff_stats}"
        if snpeff_csvstats:
            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
            snpeff_csvstats = full_path(snpeff_csvstats)
            snpeff_options += f" -csvStats {snpeff_csvstats}"

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=True,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # snpEff java options
        # NOTE(review): computed but not used below — the jar invocation relies
        # on snpeff_bin_command; confirm whether these options should be injected
        snpeff_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {snpeff_java_options}")

        # Force annotation (always re-annotate, even if "ANN" already in header)
        force_update_annotation = True

        if "ANN" not in self.get_header().infos or force_update_annotation:

            # Check snpEff database (downloaded if missing)
            log.debug(f"Check snpEff databases {[assembly]}")
            databases_download_snpeff(
                folder=snpeff_databases, assemblies=[assembly], config=config
            )

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=True,
                add_samples=False,
                index=True,
            )

            # Tmp file for the annotated output and its stderr capture
            err_files = []
            tmp_annotate_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf",
                delete=False,
            )
            tmp_annotate_vcf_name = tmp_annotate_vcf.name
            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
            err_files.append(tmp_annotate_vcf_name_err)

            # Command: snpEff writes the annotated VCF to stdout
            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
            log.debug(f"Annotation - snpEff command: {snpeff_command}")
            run_parallel_commands([snpeff_command], 1)

            # Error messages: scan captured stderr for htslib-style
            # warning ("[W::") and error ("[E::") prefixes
            log.info(f"Error/Warning messages:")
            error_message_command_all = []
            error_message_command_warning = []
            error_message_command_err = []
            for err_file in err_files:
                with open(err_file, "r") as f:
                    for line in f:
                        message = line.strip()
                        error_message_command_all.append(message)
                        if line.startswith("[W::"):
                            error_message_command_warning.append(message)
                        if line.startswith("[E::"):
                            error_message_command_err.append(f"{err_file}: " + message)
            # log info
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f"   {message}")
            # debug info
            for message in list(set(error_message_command_all)):
                log.debug(f"   {message}")
            # failed: any "[E::" line aborts the annotation
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Find new annotations in the annotated VCF header and merge
            # them into the in-memory header
            with open(tmp_annotate_vcf_name, "rt") as f:
                header_list = self.read_vcf_header(f)
            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

            for ann in annovar_vcf_header.infos:
                if ann not in self.get_header().infos:
                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

            # Update variants
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
This function annotates the variants with snpEff.
Parameters
- threads: the number of threads to use
Returns
None; the variants table and header are updated in place.
    def annotation_annovar(self, threads: int = None) -> None:
        """
        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
        annotations.

        For each configured Annovar database, the variants are exported to a
        temporary VCF, run through ``table_annovar.pl``, piped through sed/awk to
        clean Annovar artefacts, filtered/renamed with bcftools, then all
        per-database annotated VCFs are merged with ``bcftools merge`` and the
        variants table and in-memory header are updated.

        :param threads: number of threads to use (defaults to the instance's
            configured thread count)
        :return: None; returns early if the variants table is empty
        :raises ValueError: if the annovar or bcftools command cannot be resolved,
            or if a command wrote ERROR lines to its stderr file
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files (collected for final cleanup)
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is only logged here; actual cleanup happens
        # unconditionally at the end of the method — confirm intent
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl wrapper for table_annovar.pl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl flags, e.g. genebase)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: {database_name: {field: new_name, ...}}
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        force_update_annotation = True

        if annotations:

            # NOTE(review): 'commands' appears unused below — confirm
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO replaced by "."; Annovar recomputes annotations)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing database files)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run (plus cleaning pipe) per database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                # table_annovar.pl writes <prefix>.<assembly>_multianno.vcf
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info ("INFO/<old> <new>" line per field)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "f" filter-based by default, "g" gene-based, "r" region-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation): turn escaped
                # semicolons (\x3b) into commas so INFO parsing stays valid
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value "."): rebuild INFO keeping
                # only key=value pairs whose value is not "."
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        # "^INFO/x" means keep x, drop the rest (bcftools -x semantics)
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr files for warning/error lines
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f" {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f" {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: original VCF + all per-database annotated VCFs
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF and add new INFO fields
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants table from the merged annotated VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations.
Parameters
- threads: number of threads to use
Returns
None; the variants table and header are updated in place.
5343 def annotation_parquet(self, threads: int = None) -> None: 5344 """ 5345 It takes a VCF file, and annotates it with a parquet file 5346 5347 :param threads: number of threads to use for the annotation 5348 :return: the value of the variable "result". 5349 """ 5350 5351 # DEBUG 5352 log.debug("Start annotation with parquet databases") 5353 5354 # Threads 5355 if not threads: 5356 threads = self.get_threads() 5357 log.debug("Threads: " + str(threads)) 5358 5359 # DEBUG 5360 delete_tmp = True 5361 if self.get_config().get("verbosity", "warning") in ["debug"]: 5362 delete_tmp = False 5363 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5364 5365 # Config 5366 databases_folders = set( 5367 self.get_config() 5368 .get("folders", {}) 5369 .get("databases", {}) 5370 .get("annotations", ["."]) 5371 + self.get_config() 5372 .get("folders", {}) 5373 .get("databases", {}) 5374 .get("parquet", ["."]) 5375 ) 5376 log.debug("Databases annotations: " + str(databases_folders)) 5377 5378 # Param 5379 annotations = ( 5380 self.get_param() 5381 .get("annotation", {}) 5382 .get("parquet", {}) 5383 .get("annotations", None) 5384 ) 5385 log.debug("Annotations: " + str(annotations)) 5386 5387 # Assembly 5388 assembly = self.get_param().get( 5389 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 5390 ) 5391 5392 # Force Update Annotation 5393 force_update_annotation = ( 5394 self.get_param() 5395 .get("annotation", {}) 5396 .get("options", {}) 5397 .get("annotations_update", False) 5398 ) 5399 log.debug(f"force_update_annotation={force_update_annotation}") 5400 force_append_annotation = ( 5401 self.get_param() 5402 .get("annotation", {}) 5403 .get("options", {}) 5404 .get("annotations_append", False) 5405 ) 5406 log.debug(f"force_append_annotation={force_append_annotation}") 5407 5408 # Data 5409 table_variants = self.get_table_variants() 5410 5411 # Check if not empty 5412 log.debug("Check if not empty") 5413 sql_query_chromosomes_df = self.get_query_to_df( 
5414 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5415 ) 5416 if not sql_query_chromosomes_df["count"][0]: 5417 log.info(f"VCF empty") 5418 return 5419 5420 # VCF header 5421 vcf_reader = self.get_header() 5422 log.debug("Initial header: " + str(vcf_reader.infos)) 5423 5424 # Nb Variants POS 5425 log.debug("NB Variants Start") 5426 nb_variants = self.conn.execute( 5427 f"SELECT count(*) AS count FROM variants" 5428 ).fetchdf()["count"][0] 5429 log.debug("NB Variants Stop") 5430 5431 # Existing annotations 5432 for vcf_annotation in self.get_header().infos: 5433 5434 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5435 log.debug( 5436 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5437 ) 5438 5439 # Added columns 5440 added_columns = [] 5441 5442 # drop indexes 5443 log.debug(f"Drop indexes...") 5444 self.drop_indexes() 5445 5446 if annotations: 5447 5448 if "ALL" in annotations: 5449 5450 all_param = annotations.get("ALL", {}) 5451 all_param_formats = all_param.get("formats", None) 5452 all_param_releases = all_param.get("releases", None) 5453 5454 databases_infos_dict = self.scan_databases( 5455 database_formats=all_param_formats, 5456 database_releases=all_param_releases, 5457 ) 5458 for database_infos in databases_infos_dict.keys(): 5459 if database_infos not in annotations: 5460 annotations[database_infos] = {"INFO": None} 5461 5462 for annotation in annotations: 5463 5464 if annotation in ["ALL"]: 5465 continue 5466 5467 # Annotation Name 5468 annotation_name = os.path.basename(annotation) 5469 5470 # Annotation fields 5471 annotation_fields = annotations[annotation] 5472 if not annotation_fields: 5473 annotation_fields = {"INFO": None} 5474 5475 log.debug(f"Annotation '{annotation_name}'") 5476 log.debug( 5477 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5478 ) 5479 5480 # Create Database 5481 database = Database( 5482 database=annotation, 5483 
databases_folders=databases_folders, 5484 assembly=assembly, 5485 ) 5486 5487 # Find files 5488 parquet_file = database.get_database() 5489 parquet_hdr_file = database.get_header_file() 5490 parquet_type = database.get_type() 5491 5492 # Check if files exists 5493 if not parquet_file or not parquet_hdr_file: 5494 log.error("Annotation failed: file not found") 5495 raise ValueError("Annotation failed: file not found") 5496 else: 5497 # Get parquet connexion 5498 parquet_sql_attach = database.get_sql_database_attach( 5499 output="query" 5500 ) 5501 if parquet_sql_attach: 5502 self.conn.execute(parquet_sql_attach) 5503 parquet_file_link = database.get_sql_database_link() 5504 # Log 5505 log.debug( 5506 f"Annotation '{annotation_name}' - file: " 5507 + str(parquet_file) 5508 + " and " 5509 + str(parquet_hdr_file) 5510 ) 5511 5512 # Database full header columns 5513 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 5514 parquet_hdr_file 5515 ) 5516 # Log 5517 log.debug( 5518 "Annotation database header columns : " 5519 + str(parquet_hdr_vcf_header_columns) 5520 ) 5521 5522 # Load header as VCF object 5523 parquet_hdr_vcf_header_infos = database.get_header().infos 5524 # Log 5525 log.debug( 5526 "Annotation database header: " 5527 + str(parquet_hdr_vcf_header_infos) 5528 ) 5529 5530 # Get extra infos 5531 parquet_columns = database.get_extra_columns() 5532 # Log 5533 log.debug("Annotation database Columns: " + str(parquet_columns)) 5534 5535 # Add extra columns if "ALL" in annotation_fields 5536 # if "ALL" in annotation_fields: 5537 # allow_add_extra_column = True 5538 if "ALL" in annotation_fields and database.get_extra_columns(): 5539 for extra_column in database.get_extra_columns(): 5540 if ( 5541 extra_column not in annotation_fields 5542 and extra_column.replace("INFO/", "") 5543 not in parquet_hdr_vcf_header_infos 5544 ): 5545 parquet_hdr_vcf_header_infos[extra_column] = ( 5546 vcf.parser._Info( 5547 extra_column, 5548 ".", 5549 "String", 5550 
f"{extra_column} description", 5551 "unknown", 5552 "unknown", 5553 self.code_type_map["String"], 5554 ) 5555 ) 5556 5557 # For all fields in database 5558 annotation_fields_all = False 5559 if "ALL" in annotation_fields or "INFO" in annotation_fields: 5560 annotation_fields_all = True 5561 annotation_fields = { 5562 key: key for key in parquet_hdr_vcf_header_infos 5563 } 5564 5565 log.debug( 5566 "Annotation database header - All annotations added: " 5567 + str(annotation_fields) 5568 ) 5569 5570 # Init 5571 5572 # List of annotation fields to use 5573 sql_query_annotation_update_info_sets = [] 5574 5575 # List of annotation to agregate 5576 sql_query_annotation_to_agregate = [] 5577 5578 # Number of fields 5579 nb_annotation_field = 0 5580 5581 # Annotation fields processed 5582 annotation_fields_processed = [] 5583 5584 # Columns mapping 5585 map_columns = database.map_columns( 5586 columns=annotation_fields, prefixes=["INFO/"] 5587 ) 5588 5589 # Query dict for fields to remove (update option) 5590 query_dict_remove = {} 5591 5592 # Fetch Anotation fields 5593 for annotation_field in annotation_fields: 5594 5595 # annotation_field_column 5596 annotation_field_column = map_columns.get( 5597 annotation_field, "INFO" 5598 ) 5599 5600 # field new name, if parametered 5601 annotation_fields_new_name = annotation_fields.get( 5602 annotation_field, annotation_field 5603 ) 5604 if not annotation_fields_new_name: 5605 annotation_fields_new_name = annotation_field 5606 5607 # To annotate 5608 # force_update_annotation = True 5609 # force_append_annotation = True 5610 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 5611 if annotation_field in parquet_hdr_vcf_header_infos and ( 5612 force_update_annotation 5613 or force_append_annotation 5614 or ( 5615 annotation_fields_new_name 5616 not in self.get_header().infos 5617 ) 5618 ): 5619 5620 # Add field to annotation to process 
list 5621 annotation_fields_processed.append( 5622 annotation_fields_new_name 5623 ) 5624 5625 # explode infos for the field 5626 annotation_fields_new_name_info_msg = "" 5627 if ( 5628 force_update_annotation 5629 and annotation_fields_new_name 5630 in self.get_header().infos 5631 ): 5632 # Remove field from INFO 5633 query = f""" 5634 UPDATE {table_variants} as table_variants 5635 SET INFO = REGEXP_REPLACE( 5636 concat(table_variants.INFO,''), 5637 ';*{annotation_fields_new_name}=[^;]*', 5638 '' 5639 ) 5640 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 5641 """ 5642 annotation_fields_new_name_info_msg = " [update]" 5643 query_dict_remove[ 5644 f"remove 'INFO/{annotation_fields_new_name}'" 5645 ] = query 5646 5647 # Sep between fields in INFO 5648 nb_annotation_field += 1 5649 if nb_annotation_field > 1: 5650 annotation_field_sep = ";" 5651 else: 5652 annotation_field_sep = "" 5653 5654 log.info( 5655 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 5656 ) 5657 5658 # Add INFO field to header 5659 parquet_hdr_vcf_header_infos_number = ( 5660 parquet_hdr_vcf_header_infos[annotation_field].num 5661 or "." 
5662 ) 5663 parquet_hdr_vcf_header_infos_type = ( 5664 parquet_hdr_vcf_header_infos[annotation_field].type 5665 or "String" 5666 ) 5667 parquet_hdr_vcf_header_infos_description = ( 5668 parquet_hdr_vcf_header_infos[annotation_field].desc 5669 or f"{annotation_field} description" 5670 ) 5671 parquet_hdr_vcf_header_infos_source = ( 5672 parquet_hdr_vcf_header_infos[annotation_field].source 5673 or "unknown" 5674 ) 5675 parquet_hdr_vcf_header_infos_version = ( 5676 parquet_hdr_vcf_header_infos[annotation_field].version 5677 or "unknown" 5678 ) 5679 5680 vcf_reader.infos[annotation_fields_new_name] = ( 5681 vcf.parser._Info( 5682 annotation_fields_new_name, 5683 parquet_hdr_vcf_header_infos_number, 5684 parquet_hdr_vcf_header_infos_type, 5685 parquet_hdr_vcf_header_infos_description, 5686 parquet_hdr_vcf_header_infos_source, 5687 parquet_hdr_vcf_header_infos_version, 5688 self.code_type_map[ 5689 parquet_hdr_vcf_header_infos_type 5690 ], 5691 ) 5692 ) 5693 5694 # Append 5695 if force_append_annotation: 5696 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 5697 else: 5698 query_case_when_append = "" 5699 5700 # Annotation/Update query fields 5701 # Found in INFO column 5702 if ( 5703 annotation_field_column == "INFO" 5704 and "INFO" in parquet_hdr_vcf_header_columns 5705 ): 5706 sql_query_annotation_update_info_sets.append( 5707 f""" 5708 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 5709 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 5710 ELSE '' 5711 END 5712 """ 5713 ) 5714 # Found in a specific column 5715 else: 5716 sql_query_annotation_update_info_sets.append( 5717 f""" 5718 CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append} 5719 THEN 
concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ',')) 5720 ELSE '' 5721 END 5722 """ 5723 ) 5724 sql_query_annotation_to_agregate.append( 5725 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 5726 ) 5727 5728 # Not to annotate 5729 else: 5730 5731 if force_update_annotation: 5732 annotation_message = "forced" 5733 else: 5734 annotation_message = "skipped" 5735 5736 if annotation_field not in parquet_hdr_vcf_header_infos: 5737 log.warning( 5738 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 5739 ) 5740 if annotation_fields_new_name in self.get_header().infos: 5741 log.warning( 5742 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 5743 ) 5744 5745 # Check if ALL fields have to be annotated. Thus concat all INFO field 5746 # allow_annotation_full_info = True 5747 allow_annotation_full_info = not force_append_annotation 5748 5749 if parquet_type in ["regions"]: 5750 allow_annotation_full_info = False 5751 5752 if ( 5753 allow_annotation_full_info 5754 and nb_annotation_field == len(annotation_fields) 5755 and annotation_fields_all 5756 and ( 5757 "INFO" in parquet_hdr_vcf_header_columns 5758 and "INFO" in database.get_extra_columns() 5759 ) 5760 ): 5761 log.debug("Column INFO annotation enabled") 5762 sql_query_annotation_update_info_sets = [] 5763 sql_query_annotation_update_info_sets.append( 5764 f" table_parquet.INFO " 5765 ) 5766 5767 if sql_query_annotation_update_info_sets: 5768 5769 # Annotate 5770 log.info(f"Annotation '{annotation_name}' - Annotation...") 5771 5772 # Join query annotation update info sets for SQL 5773 sql_query_annotation_update_info_sets_sql = ",".join( 5774 sql_query_annotation_update_info_sets 5775 ) 5776 5777 # Check chromosomes list (and variants 
infos) 5778 sql_query_chromosomes = f""" 5779 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 5780 FROM {table_variants} as table_variants 5781 GROUP BY table_variants."#CHROM" 5782 ORDER BY table_variants."#CHROM" 5783 """ 5784 sql_query_chromosomes_df = self.conn.execute( 5785 sql_query_chromosomes 5786 ).df() 5787 sql_query_chromosomes_dict = { 5788 entry["CHROM"]: { 5789 "count": entry["count_variants"], 5790 "min": entry["min_variants"], 5791 "max": entry["max_variants"], 5792 } 5793 for index, entry in sql_query_chromosomes_df.iterrows() 5794 } 5795 5796 # Init 5797 nb_of_query = 0 5798 nb_of_variant_annotated = 0 5799 query_dict = query_dict_remove 5800 5801 # for chrom in sql_query_chromosomes_df["CHROM"]: 5802 for chrom in sql_query_chromosomes_dict: 5803 5804 # Number of variant by chromosome 5805 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 5806 chrom, {} 5807 ).get("count", 0) 5808 5809 log.debug( 5810 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 
5811 ) 5812 5813 # Annotation with regions database 5814 if parquet_type in ["regions"]: 5815 sql_query_annotation_from_clause = f""" 5816 FROM ( 5817 SELECT 5818 '{chrom}' AS \"#CHROM\", 5819 table_variants_from.\"POS\" AS \"POS\", 5820 {",".join(sql_query_annotation_to_agregate)} 5821 FROM {table_variants} as table_variants_from 5822 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 5823 table_parquet_from."#CHROM" = '{chrom}' 5824 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 5825 AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1) 5826 OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 5827 ) 5828 ) 5829 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 5830 GROUP BY table_variants_from.\"POS\" 5831 ) 5832 as table_parquet 5833 """ 5834 5835 sql_query_annotation_where_clause = """ 5836 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5837 AND table_parquet.\"POS\" = table_variants.\"POS\" 5838 """ 5839 5840 # Annotation with variants database 5841 else: 5842 sql_query_annotation_from_clause = f""" 5843 FROM {parquet_file_link} as table_parquet 5844 """ 5845 sql_query_annotation_where_clause = f""" 5846 table_variants."#CHROM" = '{chrom}' 5847 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5848 AND table_parquet.\"POS\" = table_variants.\"POS\" 5849 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5850 AND table_parquet.\"REF\" = table_variants.\"REF\" 5851 """ 5852 5853 # Create update query 5854 sql_query_annotation_chrom_interval_pos = f""" 5855 UPDATE {table_variants} as table_variants 5856 SET INFO = 5857 concat( 5858 CASE WHEN table_variants.INFO NOT IN ('','.') 5859 THEN table_variants.INFO 5860 ELSE '' 5861 END 5862 , 5863 CASE WHEN table_variants.INFO NOT IN ('','.') 5864 AND ( 5865 concat({sql_query_annotation_update_info_sets_sql}) 5866 ) 5867 NOT IN ('','.') 5868 THEN ';' 5869 ELSE '' 5870 END 5871 , 5872 
{sql_query_annotation_update_info_sets_sql} 5873 ) 5874 {sql_query_annotation_from_clause} 5875 WHERE {sql_query_annotation_where_clause} 5876 ; 5877 """ 5878 5879 # Add update query to dict 5880 query_dict[ 5881 f"{chrom} [{nb_of_variant_by_chrom} variants]" 5882 ] = sql_query_annotation_chrom_interval_pos 5883 5884 nb_of_query = len(query_dict) 5885 num_query = 0 5886 5887 # SET max_expression_depth TO x 5888 self.conn.execute("SET max_expression_depth TO 10000") 5889 5890 for query_name in query_dict: 5891 query = query_dict[query_name] 5892 num_query += 1 5893 log.info( 5894 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 5895 ) 5896 result = self.conn.execute(query) 5897 nb_of_variant_annotated_by_query = result.df()["Count"][0] 5898 nb_of_variant_annotated += nb_of_variant_annotated_by_query 5899 log.info( 5900 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 5901 ) 5902 5903 log.info( 5904 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 5905 ) 5906 5907 else: 5908 5909 log.info( 5910 f"Annotation '{annotation_name}' - No Annotations available" 5911 ) 5912 5913 log.debug("Final header: " + str(vcf_reader.infos)) 5914 5915 # Remove added columns 5916 for added_column in added_columns: 5917 self.drop_column(column=added_column)
It takes a VCF file and annotates it with annotations from a Parquet file.

Parameters
- threads: number of threads to use for the annotation

Returns
- the value of the variable "result".
5919 def annotation_splice(self, threads: int = None) -> None: 5920 """ 5921 This function annotate with snpEff 5922 5923 :param threads: The number of threads to use 5924 :return: the value of the variable "return_value". 5925 """ 5926 5927 # DEBUG 5928 log.debug("Start annotation with splice tools") 5929 5930 # Threads 5931 if not threads: 5932 threads = self.get_threads() 5933 log.debug("Threads: " + str(threads)) 5934 5935 # DEBUG 5936 delete_tmp = True 5937 if self.get_config().get("verbosity", "warning") in ["debug"]: 5938 delete_tmp = False 5939 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5940 5941 # Config 5942 config = self.get_config() 5943 log.debug("Config: " + str(config)) 5944 splice_config = config.get("tools", {}).get("splice", {}) 5945 if not splice_config: 5946 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 5947 if not splice_config: 5948 msg_err = "No Splice tool config" 5949 log.error(msg_err) 5950 raise ValueError(msg_err) 5951 log.debug(f"splice_config={splice_config}") 5952 5953 # Config - Folders - Databases 5954 databases_folders = ( 5955 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 5956 ) 5957 log.debug("Databases annotations: " + str(databases_folders)) 5958 5959 # Splice docker image 5960 splice_docker_image = splice_config.get("docker").get("image") 5961 5962 # Pull splice image if it's not already there 5963 if not check_docker_image_exists(splice_docker_image): 5964 log.warning( 5965 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 5966 ) 5967 try: 5968 command(f"docker pull {splice_config.get('docker').get('image')}") 5969 except subprocess.CalledProcessError: 5970 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 5971 log.error(msg_err) 5972 raise ValueError(msg_err) 5973 return None 5974 5975 # Config - splice databases 5976 splice_databases = ( 5977 config.get("folders", {}) 5978 .get("databases", {}) 5979 
.get("splice", DEFAULT_SPLICE_FOLDER) 5980 ) 5981 splice_databases = full_path(splice_databases) 5982 5983 # Param 5984 param = self.get_param() 5985 log.debug("Param: " + str(param)) 5986 5987 # Param 5988 options = param.get("annotation", {}).get("splice", {}) 5989 log.debug("Options: " + str(options)) 5990 5991 # Data 5992 table_variants = self.get_table_variants() 5993 5994 # Check if not empty 5995 log.debug("Check if not empty") 5996 sql_query_chromosomes = ( 5997 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5998 ) 5999 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6000 log.info("VCF empty") 6001 return None 6002 6003 # Export in VCF 6004 log.debug("Create initial file to annotate") 6005 6006 # Create output folder 6007 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6008 if not os.path.exists(output_folder): 6009 Path(output_folder).mkdir(parents=True, exist_ok=True) 6010 6011 # Create tmp VCF file 6012 tmp_vcf = NamedTemporaryFile( 6013 prefix=self.get_prefix(), 6014 dir=output_folder, 6015 suffix=".vcf", 6016 delete=False, 6017 ) 6018 tmp_vcf_name = tmp_vcf.name 6019 6020 # VCF header 6021 header = self.get_header() 6022 6023 # Existing annotations 6024 for vcf_annotation in self.get_header().infos: 6025 6026 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6027 log.debug( 6028 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6029 ) 6030 6031 # Memory limit 6032 if config.get("memory", None): 6033 memory_limit = config.get("memory", "8G").upper() 6034 # upper() 6035 else: 6036 memory_limit = "8G" 6037 log.debug(f"memory_limit: {memory_limit}") 6038 6039 # Check number of variants to annotate 6040 where_clause_regex_spliceai = r"SpliceAI_\w+" 6041 where_clause_regex_spip = r"SPiP_\w+" 6042 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6043 
df_list_of_variants_to_annotate = self.get_query_to_df( 6044 query=f""" SELECT * FROM variants {where_clause} """ 6045 ) 6046 if len(df_list_of_variants_to_annotate) == 0: 6047 log.warning( 6048 f"No variants to annotate with splice. Variants probably already annotated with splice" 6049 ) 6050 return None 6051 else: 6052 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6053 6054 # Export VCF file 6055 self.export_variant_vcf( 6056 vcf_file=tmp_vcf_name, 6057 remove_info=True, 6058 add_samples=True, 6059 index=False, 6060 where_clause=where_clause, 6061 ) 6062 6063 # Create docker container and launch splice analysis 6064 if splice_config: 6065 6066 # Splice mount folders 6067 mount_folders = splice_config.get("mount", {}) 6068 6069 # Genome mount 6070 mount_folders[ 6071 config.get("folders", {}) 6072 .get("databases", {}) 6073 .get("genomes", DEFAULT_GENOME_FOLDER) 6074 ] = "ro" 6075 6076 # SpliceAI mount 6077 mount_folders[ 6078 config.get("folders", {}) 6079 .get("databases", {}) 6080 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6081 ] = "ro" 6082 6083 # Genome mount 6084 mount_folders[ 6085 config.get("folders", {}) 6086 .get("databases", {}) 6087 .get("spip", DEFAULT_SPIP_FOLDER) 6088 ] = "ro" 6089 6090 # Mount folders 6091 mount = [] 6092 6093 # Config mount 6094 mount = [ 6095 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6096 for path, mode in mount_folders.items() 6097 ] 6098 6099 if any(value for value in splice_config.values() if value is None): 6100 log.warning("At least one splice config parameter is empty") 6101 return None 6102 6103 # Params in splice nf 6104 def check_values(dico: dict): 6105 """ 6106 Ensure parameters for NF splice pipeline 6107 """ 6108 for key, val in dico.items(): 6109 if key == "genome": 6110 if any( 6111 assemb in options.get("genome", {}) 6112 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6113 ): 6114 yield f"--{key} hg19" 6115 elif any( 6116 assemb in options.get("genome", {}) 6117 for assemb 
in ["hg38", "GRCh38", "grch38", "GRCH38"] 6118 ): 6119 yield f"--{key} hg38" 6120 elif ( 6121 (isinstance(val, str) and val) 6122 or isinstance(val, int) 6123 or isinstance(val, bool) 6124 ): 6125 yield f"--{key} {val}" 6126 6127 # Genome 6128 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6129 options["genome"] = genome 6130 6131 # NF params 6132 nf_params = [] 6133 6134 # Add options 6135 if options: 6136 nf_params = list(check_values(options)) 6137 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6138 else: 6139 log.debug("No NF params provided") 6140 6141 # Add threads 6142 if "threads" not in options.keys(): 6143 nf_params.append(f"--threads {threads}") 6144 6145 # Genome path 6146 genome_path = find_genome( 6147 config.get("folders", {}) 6148 .get("databases", {}) 6149 .get("genomes", DEFAULT_GENOME_FOLDER), 6150 file=f"{genome}.fa", 6151 ) 6152 # Add genome path 6153 if not genome_path: 6154 raise ValueError( 6155 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6156 ) 6157 else: 6158 log.debug(f"Genome: {genome_path}") 6159 nf_params.append(f"--genome_path {genome_path}") 6160 6161 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6162 """ 6163 Setting up updated databases for SPiP and SpliceAI 6164 """ 6165 6166 try: 6167 6168 # SpliceAI assembly transcriptome 6169 spliceai_assembly = os.path.join( 6170 config.get("folders", {}) 6171 .get("databases", {}) 6172 .get("spliceai", {}), 6173 options.get("genome"), 6174 "transcriptome", 6175 ) 6176 spip_assembly = options.get("genome") 6177 6178 spip = find( 6179 f"transcriptome_{spip_assembly}.RData", 6180 config.get("folders", {}).get("databases", {}).get("spip", {}), 6181 ) 6182 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6183 log.debug(f"SPiP annotations: {spip}") 6184 log.debug(f"SpliceAI annotations: {spliceai}") 6185 if spip and spliceai: 6186 return [ 6187 
f"--spip_transcriptome {spip}", 6188 f"--spliceai_annotations {spliceai}", 6189 ] 6190 else: 6191 # TODO crash and go on with basic annotations ? 6192 # raise ValueError( 6193 # "Can't find splice databases in configuration EXIT" 6194 # ) 6195 log.warning( 6196 "Can't find splice databases in configuration, use annotations file from image" 6197 ) 6198 except TypeError: 6199 log.warning( 6200 "Can't find splice databases in configuration, use annotations file from image" 6201 ) 6202 return [] 6203 6204 # Add options, check if transcriptome option have already beend provided 6205 if ( 6206 "spip_transcriptome" not in nf_params 6207 and "spliceai_transcriptome" not in nf_params 6208 ): 6209 splice_reference = splice_annotations(options, config) 6210 if splice_reference: 6211 nf_params.extend(splice_reference) 6212 6213 nf_params.append(f"--output_folder {output_folder}") 6214 6215 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6216 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6217 log.debug(cmd) 6218 6219 splice_config["docker"]["command"] = cmd 6220 6221 docker_cmd = get_bin_command( 6222 tool="splice", 6223 bin_type="docker", 6224 config=config, 6225 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6226 add_options=f"--name {random_uuid} {' '.join(mount)}", 6227 ) 6228 6229 # Docker debug 6230 # if splice_config.get("rm_container"): 6231 # rm_container = "--rm" 6232 # else: 6233 # rm_container = "" 6234 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6235 6236 log.debug(docker_cmd) 6237 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6238 log.debug(res.stdout) 6239 if 
res.stderr: 6240 log.error(res.stderr) 6241 res.check_returncode() 6242 else: 6243 log.warning(f"Splice tool configuration not found: {config}") 6244 6245 # Update variants 6246 log.info("Annotation - Updating...") 6247 # Test find output vcf 6248 log.debug( 6249 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6250 ) 6251 output_vcf = [] 6252 # Wrong folder to look in 6253 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6254 if ( 6255 files 6256 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6257 ): 6258 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6259 # log.debug(os.listdir(options.get("output_folder"))) 6260 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6261 if not output_vcf: 6262 log.debug( 6263 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6264 ) 6265 else: 6266 # Get new header from annotated vcf 6267 log.debug(f"Initial header: {len(header.infos)} fields") 6268 # Create new header with splice infos 6269 new_vcf = Variants(input=output_vcf[0]) 6270 new_vcf_header = new_vcf.get_header().infos 6271 for keys, infos in new_vcf_header.items(): 6272 if keys not in header.infos.keys(): 6273 header.infos[keys] = infos 6274 log.debug(f"New header: {len(header.infos)} fields") 6275 log.debug(f"Splice tmp output: {output_vcf[0]}") 6276 self.update_from_vcf(output_vcf[0]) 6277 6278 # Remove folder 6279 remove_if_exists(output_folder)
This function annotates the variants with splice prediction tools (SPiP and SpliceAI).

Parameters
- threads: The number of threads to use

Returns
- None
    def get_config_default(self, name: str) -> dict:
        """
        Return the built-in default configuration for a given section.

        Two sections are available:
        - "calculations": SQL-based and Python-based variant calculation
          operations (variant type, HGVS/NOMEN extraction, genotype
          statistics, transcripts annotation...)
        - "prioritizations": the "default" variant prioritization profile

        :param name: Name of the configuration section to retrieve
            ("calculations" or "prioritizations")
        :type name: str
        :return: The default configuration dictionary for the requested
            section, or None if `name` does not match a known section
        """

        # Built-in defaults, keyed by section name
        config_default = {
            # Variant calculation operations: "sql" entries provide an SQL
            # expression producing an output column; "python" entries name a
            # calculation function and its parameters
            "calculations": {
                "variant_chr_pos_alt_ref": {
                    "type": "sql",
                    "name": "variant_chr_pos_alt_ref",
                    "description": "Create a variant ID with chromosome, position, alt and ref",
                    "available": False,
                    "output_column_name": "variant_chr_pos_alt_ref",
                    "output_column_type": "String",
                    "output_column_description": "variant ID with chromosome, position, alt and ref",
                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                    "operation_info": True,
                },
                "VARTYPE": {
                    "type": "sql",
                    "name": "VARTYPE",
                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                    "available": True,
                    "output_column_name": "VARTYPE",
                    "output_column_type": "String",
                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                    "operation_query": """
                        CASE
                            WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                            WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                            WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                            WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                            WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                            ELSE 'UNDEFINED'
                        END
                        """,
                    "info_fields": ["SVTYPE"],
                    "operation_info": True,
                },
                "snpeff_hgvs": {
                    "type": "python",
                    "name": "snpeff_hgvs",
                    "description": "HGVS nomenclatures from snpEff annotation",
                    "available": True,
                    "function_name": "calculation_extract_snpeff_hgvs",
                    "function_params": ["snpeff_hgvs", "ANN"],
                },
                # NOTE(review): the descriptions of 'snpeff_ann_explode' and
                # 'snpeff_ann_explode_uniquify' appear swapped relative to
                # their uniquify flags (first function_params value) — confirm
                "snpeff_ann_explode": {
                    "type": "python",
                    "name": "snpeff_ann_explode",
                    "description": "Explode snpEff annotations with uniquify values",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "fields", "snpeff_", "ANN"],
                },
                "snpeff_ann_explode_uniquify": {
                    "type": "python",
                    "name": "snpeff_ann_explode_uniquify",
                    "description": "Explode snpEff annotations",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
                },
                "snpeff_ann_explode_json": {
                    "type": "python",
                    "name": "snpeff_ann_explode_json",
                    "description": "Explode snpEff annotations in JSON format",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
                },
                "NOMEN": {
                    "type": "python",
                    "name": "NOMEN",
                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
                    "available": True,
                    "function_name": "calculation_extract_nomen",
                    "function_params": [],
                },
                "FINDBYPIPELINE": {
                    "type": "python",
                    "name": "FINDBYPIPELINE",
                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbypipeline"],
                },
                "FINDBYSAMPLE": {
                    "type": "python",
                    "name": "FINDBYSAMPLE",
                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbysample"],
                },
                "GENOTYPECONCORDANCE": {
                    "type": "python",
                    "name": "GENOTYPECONCORDANCE",
                    "description": "Concordance of genotype for multi caller VCF",
                    "available": True,
                    "function_name": "calculation_genotype_concordance",
                    "function_params": [],
                },
                "BARCODE": {
                    "type": "python",
                    "name": "BARCODE",
                    "description": "BARCODE as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode",
                    "function_params": [],
                },
                "BARCODEFAMILY": {
                    "type": "python",
                    "name": "BARCODEFAMILY",
                    "description": "BARCODEFAMILY as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode_family",
                    "function_params": ["BCF"],
                },
                "TRIO": {
                    "type": "python",
                    "name": "TRIO",
                    "description": "Inheritance for a trio family",
                    "available": True,
                    "function_name": "calculation_trio",
                    "function_params": [],
                },
                "VAF": {
                    "type": "python",
                    "name": "VAF",
                    "description": "Variant Allele Frequency (VAF) harmonization",
                    "available": True,
                    "function_name": "calculation_vaf_normalization",
                    "function_params": [],
                },
                "VAF_stats": {
                    "type": "python",
                    "name": "VAF_stats",
                    "description": "Variant Allele Frequency (VAF) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["VAF"],
                },
                "DP_stats": {
                    "type": "python",
                    "name": "DP_stats",
                    "description": "Depth (DP) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["DP"],
                },
                "variant_id": {
                    "type": "python",
                    "name": "variant_id",
                    "description": "Variant ID generated from variant position and type",
                    "available": True,
                    "function_name": "calculation_variant_id",
                    "function_params": [],
                },
                "transcripts_json": {
                    "type": "python",
                    "name": "transcripts_json",
                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": ["transcripts_json", None],
                },
                "transcripts_ann": {
                    "type": "python",
                    "name": "transcripts_ann",
                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, "transcripts_ann"],
                },
                "transcripts_annotations": {
                    "type": "python",
                    "name": "transcripts_annotations",
                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, None],
                },
                "transcripts_prioritization": {
                    "type": "python",
                    "name": "transcripts_prioritization",
                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_prioritization",
                    "function_params": [],
                },
            },
            # Prioritization profiles: each field maps to a list of criteria
            # (comparison type/value, score, flag, comment)
            "prioritizations": {
                "default": {
                    "filter": [
                        {
                            "type": "notequals",
                            "value": "!PASS|\\.",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": ["Bad variant quality"],
                        },
                        {
                            "type": "equals",
                            "value": "REJECT",
                            "score": -20,
                            "flag": "PASS",
                            "comment": ["Bad variant quality"],
                        },
                    ],
                    "DP": [
                        {
                            "type": "gte",
                            "value": "50",
                            "score": 5,
                            "flag": "PASS",
                            "comment": ["DP higher than 50"],
                        }
                    ],
                    "ANN": [
                        {
                            "type": "contains",
                            "value": "HIGH",
                            "score": 5,
                            "flag": "PASS",
                            "comment": [
                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODERATE",
                            "score": 3,
                            "flag": "PASS",
                            "comment": [
                                "A non-disruptive variant that might change protein effectiveness"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "LOW",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Assumed to be mostly harmless or unlikely to change protein behavior"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODIFIER",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                            ],
                        },
                    ],
                }
            },
        }

        return config_default.get(name, None)
The function `get_config_default` returns a dictionary containing default configurations for
various calculations and prioritizations.

Parameters
- name: The `name` parameter is used to specify which specific configuration section to retrieve from the dictionary (e.g. "calculations" or "prioritizations").

Returns
The function `get_config_default` returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input `name` parameter provided to the function. If the `name` parameter matches a key in the `config_default` dictionary, the corresponding configuration settings are returned. If there is no match, `None` is returned.
6559 def get_config_json( 6560 self, name: str, config_dict: dict = {}, config_file: str = None 6561 ) -> dict: 6562 """ 6563 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6564 default values, a dictionary, and a file. 6565 6566 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6567 the name of the configuration. It is used to identify and retrieve the configuration settings 6568 for a specific component or module 6569 :type name: str 6570 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6571 dictionary that allows you to provide additional configuration settings or overrides. When you 6572 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6573 the key is the configuration setting you want to override or 6574 :type config_dict: dict 6575 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6576 specify the path to a configuration file that contains additional settings. If provided, the 6577 function will read the contents of this file and update the configuration dictionary with the 6578 values found in the file, overriding any existing values with the 6579 :type config_file: str 6580 :return: The function `get_config_json` returns a dictionary containing the configuration 6581 settings. 
6582 """ 6583 6584 # Create with default prioritizations 6585 config_default = self.get_config_default(name=name) 6586 configuration = config_default 6587 # log.debug(f"configuration={configuration}") 6588 6589 # Replace prioritizations from dict 6590 for config in config_dict: 6591 configuration[config] = config_dict[config] 6592 6593 # Replace prioritizations from file 6594 config_file = full_path(config_file) 6595 if config_file: 6596 if os.path.exists(config_file): 6597 with open(config_file) as config_file_content: 6598 config_file_dict = json.load(config_file_content) 6599 for config in config_file_dict: 6600 configuration[config] = config_file_dict[config] 6601 else: 6602 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6603 log.error(msg_error) 6604 raise ValueError(msg_error) 6605 6606 return configuration
The function `get_config_json` retrieves a configuration JSON object with prioritizations from
default values, a dictionary, and a file.

Parameters
- name: The `name` parameter in the `get_config_json` function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module.
- config_dict: The `config_dict` parameter in the `get_config_json` function is a dictionary that allows you to provide additional configuration settings or overrides. Its key-value pairs replace the corresponding default settings.
- config_file: The `config_file` parameter in the `get_config_json` function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values.

Returns
The function `get_config_json` returns a dictionary containing the configuration settings.
    def prioritization(
        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
    ) -> bool:
        """
        The `prioritization` function processes VCF files, adds new INFO fields, and
        prioritizes variants based on configured profiles and criteria.

        For each requested profile, every configured annotation/criterion pair is turned
        into an SQL UPDATE accumulating per-profile Score/Flag/Comment/Infos working
        columns; those columns are finally folded back into the INFO column, and the
        working columns are dropped.

        :param table: the name of the table (presumably a VCF file) on which the
        prioritization operation will be performed. If a table name is provided, the
        method will prioritize the variants in that specific table
        :type table: str
        :param pz_prefix: a prefix that will be added to certain INFO fields during the
        prioritization process. If not provided, the "pzprefix" parameter or the default
        prefix value "PZ" is used
        :type pz_prefix: str
        :param pz_param: additional parameters specific to the prioritization process
        (profiles, fields, scoring modes, flags, comments, ...). Defaults to the
        "prioritization" section of this object's param
        :type pz_param: dict
        :return: True on completion, False when no profile is defined
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization param (explicit argument takes precedence over object param)
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Configuration profiles (JSON config, optionally from an explicit file)
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options (comma-separated strings accepted for list options)
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations (comma-separated shortcut in param)
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                log.info(f" {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Fail early if a requested profile is not configured
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            log.debug(f"No profile defined")
            return False

        # First requested profile acts as the default one if not explicitly set
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Added columns (working columns, dropped at the end)
        added_columns = []

        # Create list of PZfields
        # List of PZFields: generic fields plus one field per field/profile combination
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Remove existing PZfields to use if exists
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos prefix
            explode_infos_prefix = self.get_explode_infos_prefix()

            # PZfields tags description (VCF header metadata for each generated field)
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
            }

            # Create INFO fields if not exist (default-profile aliases)
            # NOTE(review): this loop tests `field_ID in pzfields` while the per-profile
            # loop below tests `field in pzfields` — they are equivalent only because
            # each ID equals its key; confirm the asymmetry is intentional
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO fields if not exist for each profile
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Header
            # Add one working column per PZfield, typed by field kind
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    # Flags start at 1 (PASS) and are AND-ed with each criterion
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        sql_set_info = []

                        # PZ fields set
                        # Each snippet below renders one 'KEY=value' piece of the INFO field

                        # PZScore
                        if (
                            f"{pz_prefix}Score{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score{pzfields_sep}{profile}=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Score" in list_of_pzfields
                        ):
                            # Default profile also feeds the un-suffixed PZScore field
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )

                        # PZFlag
                        if (
                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Flag" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )

                        # PZComment
                        if (
                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Comment" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # PZInfos
                        if (
                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Infos" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # Merge PZfields (join the individual INFO snippets with ';')
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # Explode specific annotation
                            log.debug(f"Explode annotation '{annotation}'")
                            added_columns += self.explode_infos(
                                prefix=explode_infos_prefix,
                                fields=[annotation],
                                table=table_variants,
                            )
                            extra_infos = self.get_extra_infos(table=table_variants)

                            # Check if annotation field is present
                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
                                log.debug(f"Annotation '{annotation}' not in data")
                                continue
                            else:
                                log.debug(f"Annotation '{annotation}' in data")

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:
                                criterion_type = criterion["type"]
                                criterion_value = criterion["value"]
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_flag_bool = criterion_flag == "PASS"
                                # Sanitize text for SQL literals and VCF INFO syntax
                                # (escape quotes, ';' is the INFO separator, no tabs)
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                sql_set = []
                                sql_set_info = []

                                # PZ fields set
                                if (
                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    if prioritization_score_mode == "HOWARD":
                                        # HOWARD mode: scores are cumulative
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                    elif prioritization_score_mode == "VaRank":
                                        # VaRank mode: keep the maximum score
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
                                        )
                                    else:
                                        # Unknown mode falls back to cumulative score
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                if (
                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # AND-ed: one FILTERED criterion filters the variant
                                    sql_set.append(
                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )
                                if (
                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Comment{pzfields_sep}{profile} =
                                            concat(
                                                {pz_prefix}Comment{pzfields_sep}{profile},
                                                CASE
                                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
                                                    THEN ', '
                                                    ELSE ''
                                                END,
                                                '{criterion_comment}'
                                            )
                                        """
                                    )
                                if (
                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Infos{pzfields_sep}{profile} =
                                            concat(
                                                {pz_prefix}Infos{pzfields_sep}{profile},
                                                '{criterion_infos}'
                                            )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Criterion and comparison
                                if sql_set_option:
                                    try:
                                        # Numeric criterion: compare as FLOAT
                                        float(criterion_value)
                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                            AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                        """
                                    except:
                                        # Non-numeric criterion: SIMILAR TO regex match
                                        contains_option = ""
                                        if criterion_type == "contains":
                                            contains_option = ".*"
                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                        """
                                    sql_queries.append(sql_update)
                                else:
                                    log.warning(
                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
                                    )

                        # PZTags
                        if (
                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):

                            # Create PZFalgs value
                            # pztags_value is an SQL-expression fragment embedded in the
                            # concat() of the UPDATE below ('field#value|field#value...')
                            pztags_value = ""
                            pztags_sep_default = "|"
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in [f"{pz_prefix}Tags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in [f"{pz_prefix}Flag"]:
                                            # Flags rendered as PASS/FILTERED text
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                                    THEN 'PASS'
                                                    ELSE 'FILTERED'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Add Query update for PZFlags
                            sql_update_pztags = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    CASE WHEN INFO NOT in ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                                )
                            """
                            sql_queries.append(sql_update_pztags)

                            # Add Query update for PZFlags for default
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                    UPDATE {table_variants}
                                    SET INFO = concat(
                                        INFO,
                                        ';',
                                        '{pz_prefix}Tags={pztags_value}'
                                    )
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        if sql_queries:

                            for sql_query in sql_queries:
                                log.debug(
                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                                )
                                self.conn.execute(sql_query)

                            # Fold the per-profile working columns back into INFO
                            log.info(f"""Profile '{profile}' - Update... """)
                            sql_query_update = f"""
                                UPDATE {table_variants}
                                SET INFO =
                                    concat(
                                        CASE
                                            WHEN INFO NOT IN ('','.')
                                            THEN concat(INFO, ';')
                                            ELSE ''
                                        END
                                        {sql_set_info_option}
                                    )
                            """
                            self.conn.execute(sql_query_update)

            else:

                log.warning(f"No profiles in parameters")

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return True
The `prioritization` function processes VCF files, adds new INFO fields, and
prioritizes variants based on configured profiles and criteria.

Parameters
- `table`: the name of the table (presumably a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table.
- `pz_prefix`: a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ".
- `pz_param`: additional parameters specific to the prioritization process. These can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF.

Returns
A boolean value (True) is returned from the `prioritization` function.
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using
        genomic coordinates and alleles.

        Variants are joined against refSeq (and optionally refSeqLink) tables, HGVS names
        are computed per partition with Dask, and the result is written back to the
        variants table and folded into the INFO column (field 'hgvs').

        :param threads: The `threads` parameter is an optional integer that specifies the
        number of threads to use for parallel processing. If no value is provided, it will
        default to the number of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition`
            function to each row of a DataFrame called `partition`.

            :param partition: a pandas DataFrame containing the data to be processed
            :return: the result of applying `annotation_hgvs_partition` to each row of
            the partition (axis=1)
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a
            string containing a comma-separated list of HGVS names associated with the
            given genomic coordinates and alleles.

            Relies on the enclosing scope for `polars_conn`, `transcripts`, `genome` and
            the HGVS options (closure variables).

            :param row: a dictionary-like object with keys CHROM, POS, REF, ALT
            :return: a string that contains the HGVS names for the row
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein (looked up in refSeqLink when a protein form is wanted)
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add the protein-level form as a second entry
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS (comma-separated "option=value" shortcuts merged into param["hgvs"])
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f" {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSseq refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (alleles made of letters only)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (working columns, dropped at the end)
        added_columns = []

        # Add hgvs column in variants table (randomized name to avoid collisions)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        # (only transcripts overlapping a variant position are kept)
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe (transcript -> protein accession map)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table}
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

        # Update INFO column (append 'hgvs=...' for annotated variants)
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO =
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
        """
        self.execute_query(sql_query_update)

        # Add header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
coordinates and alleles.

Parameters
- `threads`: an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it defaults to the number of threads obtained from the `get_threads()` method.
7599 def get_operations_help( 7600 self, operations_config_dict: dict = {}, operations_config_file: str = None 7601 ) -> list: 7602 7603 # Init 7604 operations_help = [] 7605 7606 # operations 7607 operations = self.get_config_json( 7608 name="calculations", 7609 config_dict=operations_config_dict, 7610 config_file=operations_config_file, 7611 ) 7612 for op in operations: 7613 op_name = operations[op].get("name", op).upper() 7614 op_description = operations[op].get("description", op_name) 7615 op_available = operations[op].get("available", False) 7616 if op_available: 7617 operations_help.append(f" {op_name}: {op_description}") 7618 7619 # Sort operations 7620 operations_help.sort() 7621 7622 # insert header 7623 operations_help.insert(0, "Available calculation operations:") 7624 7625 # Return 7626 return operations_help
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                    "middle": null
                }
            }

        :param operations: operations to run, keyed by operation name (overridden by the
        "calculation"/"calculations" section of param when present)
        :type operations: dict
        :param operations_config_dict: optional calculations configuration as a dict
        :type operations_config_dict: dict
        :param operations_config_file: optional path to a calculations configuration file
        :type operations_config_file: str
        :raises ValueError: when an operation or its type is not available in the config
        """

        # Param
        param = self.get_param()

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys (operation lookup is case-insensitive)
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add
        # NOTE(review): when param has no "calculation" section, `operations` may still be
        # the mutable default `{}` here and gets mutated below — shared across calls;
        # confirm this is intended
        if param.get("calculations", None):
            calculations_list = [
                value for value in param.get("calculations", "").split(",")
            ]
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f" {calculation_key}")
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations:
                    operations[calculation_operation.upper()] = {}
                # Also record the operation in the param tree
                add_value_into_dict(
                    dict_tree=param,
                    sections=[
                        "calculation",
                        "calculations",
                        calculation_operation.upper(),
                    ],
                    value={},
                )

        # Operations for calculation
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # For each operations
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        operation_type = operation.get("type", "sql")
                        # Dispatch on the configured operation type
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
It takes a list of operations and, for each operation, checks whether it is a Python or SQL operation, then calls the appropriate processing function.

param JSON example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }
7733 def calculation_process_sql( 7734 self, operation: dict, operation_name: str = "unknown" 7735 ) -> None: 7736 """ 7737 The `calculation_process_sql` function takes in a mathematical operation as a string and 7738 performs the operation, updating the specified table with the result. 7739 7740 :param operation: The `operation` parameter is a dictionary that contains information about the 7741 mathematical operation to be performed. It includes the following keys: 7742 :type operation: dict 7743 :param operation_name: The `operation_name` parameter is a string that represents the name of 7744 the mathematical operation being performed. It is used for logging and error handling purposes, 7745 defaults to unknown 7746 :type operation_name: str (optional) 7747 """ 7748 7749 # table variants 7750 table_variants = self.get_table_variants(clause="alter") 7751 7752 # Operation infos 7753 operation_name = operation.get("name", "unknown") 7754 log.debug(f"process sql {operation_name}") 7755 output_column_name = operation.get("output_column_name", operation_name) 7756 output_column_type = operation.get("output_column_type", "String") 7757 prefix = operation.get("explode_infos_prefix", "") 7758 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 7759 output_column_description = operation.get( 7760 "output_column_description", f"{operation_name} operation" 7761 ) 7762 operation_query = operation.get("operation_query", None) 7763 if isinstance(operation_query, list): 7764 operation_query = " ".join(operation_query) 7765 operation_info_fields = operation.get("info_fields", []) 7766 operation_info_fields_check = operation.get("info_fields_check", False) 7767 operation_info = operation.get("operation_info", True) 7768 7769 if operation_query: 7770 7771 # Info fields check 7772 operation_info_fields_check_result = True 7773 if operation_info_fields_check: 7774 header_infos = self.get_header().infos 7775 for info_field in operation_info_fields: 7776 
operation_info_fields_check_result = ( 7777 operation_info_fields_check_result 7778 and info_field in header_infos 7779 ) 7780 7781 # If info fields available 7782 if operation_info_fields_check_result: 7783 7784 # Added_columns 7785 added_columns = [] 7786 7787 # Create VCF header field 7788 vcf_reader = self.get_header() 7789 vcf_reader.infos[output_column_name] = vcf.parser._Info( 7790 output_column_name, 7791 ".", 7792 output_column_type, 7793 output_column_description, 7794 "howard calculation", 7795 "0", 7796 self.code_type_map.get(output_column_type), 7797 ) 7798 7799 # Explode infos if needed 7800 log.debug(f"calculation_process_sql prefix {prefix}") 7801 added_columns += self.explode_infos( 7802 prefix=prefix, 7803 fields=[output_column_name] + operation_info_fields, 7804 force=True, 7805 ) 7806 7807 # Create column 7808 added_column = self.add_column( 7809 table_name=table_variants, 7810 column_name=prefix + output_column_name, 7811 column_type=output_column_type_sql, 7812 default_value="null", 7813 ) 7814 added_columns.append(added_column) 7815 7816 # Operation calculation 7817 try: 7818 7819 # Query to update calculation column 7820 sql_update = f""" 7821 UPDATE {table_variants} 7822 SET "{prefix}{output_column_name}" = ({operation_query}) 7823 """ 7824 self.conn.execute(sql_update) 7825 7826 # Add to INFO 7827 if operation_info: 7828 sql_update_info = f""" 7829 UPDATE {table_variants} 7830 SET "INFO" = 7831 concat( 7832 CASE 7833 WHEN "INFO" IS NOT NULL 7834 THEN concat("INFO", ';') 7835 ELSE '' 7836 END, 7837 '{output_column_name}=', 7838 "{prefix}{output_column_name}" 7839 ) 7840 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 7841 """ 7842 self.conn.execute(sql_update_info) 7843 7844 except: 7845 log.error( 7846 f"Operations config: Calculation '{operation_name}' query failed" 7847 ) 7848 raise ValueError( 7849 f"Operations config: Calculation '{operation_name}' query failed" 7850 ) 7851 7852 # Remove 
added columns 7853 for added_column in added_columns: 7854 log.debug(f"added_column: {added_column}") 7855 self.drop_column(column=added_column) 7856 7857 else: 7858 log.error( 7859 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7860 ) 7861 raise ValueError( 7862 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7863 ) 7864 7865 else: 7866 log.error( 7867 f"Operations config: Calculation '{operation_name}' query NOT defined" 7868 ) 7869 raise ValueError( 7870 f"Operations config: Calculation '{operation_name}' query NOT defined" 7871 )
The calculation_process_sql function takes in a mathematical operation as a string and
performs the operation, updating the specified table with the result.
Parameters

- operation: The `operation` parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes; defaults to "unknown".
7873 def calculation_process_function( 7874 self, operation: dict, operation_name: str = "unknown" 7875 ) -> None: 7876 """ 7877 The `calculation_process_function` takes in an operation dictionary and performs the specified 7878 function with the given parameters. 7879 7880 :param operation: The `operation` parameter is a dictionary that contains information about the 7881 operation to be performed. It has the following keys: 7882 :type operation: dict 7883 :param operation_name: The `operation_name` parameter is a string that represents the name of 7884 the operation being performed. It is used for logging purposes, defaults to unknown 7885 :type operation_name: str (optional) 7886 """ 7887 7888 operation_name = operation["name"] 7889 log.debug(f"process sql {operation_name}") 7890 function_name = operation["function_name"] 7891 function_params = operation["function_params"] 7892 getattr(self, function_name)(*function_params)
The calculation_process_function takes in an operation dictionary and performs the specified
function with the given parameters.
Parameters

- operation: The `operation` parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the operation being performed. It is used for logging purposes; defaults to "unknown".
    def calculation_variant_id(self) -> None:
        """
        Add a variant ID annotation to the VCF header and append it to the
        INFO field of every row of the variants table.

        The variant ID column itself is transient: it is dropped once the
        INFO field has been updated.
        """

        # variant_id annotation field
        # NOTE(review): get_variant_id_column() presumably also ensures the
        # column exists in the table — confirm against its implementation
        variant_id_tag = self.get_variant_id_column()
        added_columns = [variant_id_tag]

        # variant_id tag description
        vcf_infos_tags = {
            variant_id_tag: "howard variant ID annotation",
        }

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add variant_id to header
        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
            variant_id_tag,
            ".",
            "String",
            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append '<tag>=<value>' to INFO, prefixing with ';' only when INFO
        # already holds a value (NULL, '' and '.' count as empty)
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    '{variant_id_tag}=',
                    "{variant_id_tag}"
                )
        """
        self.conn.execute(sql_update)

        # Remove the transient variant ID column
        for added_column in added_columns:
            self.drop_column(column=added_column)
The function calculation_variant_id adds a variant ID annotation to a VCF file header and
updates the INFO field of a variants table with the variant ID.
7946 def calculation_extract_snpeff_hgvs( 7947 self, 7948 snpeff_hgvs: str = "snpeff_hgvs", 7949 snpeff_field: str = "ANN", 7950 ) -> None: 7951 """ 7952 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 7953 annotation field in a VCF file and adds them as a new column in the variants table. 7954 7955 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 7956 function is used to specify the name of the column that will store the HGVS nomenclatures 7957 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 7958 snpeff_hgvs 7959 :type snpeff_hgvs: str (optional) 7960 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 7961 function represents the field in the VCF file that contains SnpEff annotations. This field is 7962 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 7963 to ANN 7964 :type snpeff_field: str (optional) 7965 """ 7966 7967 # Snpeff hgvs tags 7968 vcf_infos_tags = { 7969 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 7970 } 7971 7972 # Prefix 7973 prefix = self.get_explode_infos_prefix() 7974 if prefix: 7975 prefix = "INFO/" 7976 7977 # snpEff fields 7978 speff_ann_infos = prefix + snpeff_field 7979 speff_hgvs_infos = prefix + snpeff_hgvs 7980 7981 # Variants table 7982 table_variants = self.get_table_variants() 7983 7984 # Header 7985 vcf_reader = self.get_header() 7986 7987 # Add columns 7988 added_columns = [] 7989 7990 # Explode HGVS field in column 7991 added_columns += self.explode_infos(fields=[snpeff_field]) 7992 7993 if snpeff_field in vcf_reader.infos: 7994 7995 log.debug(vcf_reader.infos[snpeff_field]) 7996 7997 # Extract ANN header 7998 ann_description = vcf_reader.infos[snpeff_field].desc 7999 pattern = r"'(.+?)'" 8000 match = re.search(pattern, ann_description) 8001 if match: 8002 ann_header_match = match.group(1).split(" | ") 
8003 ann_header_desc = {} 8004 for i in range(len(ann_header_match)): 8005 ann_header_info = "".join( 8006 char for char in ann_header_match[i] if char.isalnum() 8007 ) 8008 ann_header_desc[ann_header_info] = ann_header_match[i] 8009 if not ann_header_desc: 8010 raise ValueError("Invalid header description format") 8011 else: 8012 raise ValueError("Invalid header description format") 8013 8014 # Create variant id 8015 variant_id_column = self.get_variant_id_column() 8016 added_columns += [variant_id_column] 8017 8018 # Create dataframe 8019 dataframe_snpeff_hgvs = self.get_query_to_df( 8020 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8021 ) 8022 8023 # Create main NOMEN column 8024 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8025 speff_ann_infos 8026 ].apply( 8027 lambda x: extract_snpeff_hgvs( 8028 str(x), header=list(ann_header_desc.values()) 8029 ) 8030 ) 8031 8032 # Add snpeff_hgvs to header 8033 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 8034 snpeff_hgvs, 8035 ".", 8036 "String", 8037 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 8038 "howard calculation", 8039 "0", 8040 self.code_type_map.get("String"), 8041 ) 8042 8043 # Update 8044 sql_update = f""" 8045 UPDATE variants 8046 SET "INFO" = 8047 concat( 8048 CASE 8049 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8050 THEN '' 8051 ELSE concat("INFO", ';') 8052 END, 8053 CASE 8054 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8055 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8056 THEN concat( 8057 '{snpeff_hgvs}=', 8058 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8059 ) 8060 ELSE '' 8061 END 8062 ) 8063 FROM dataframe_snpeff_hgvs 8064 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8065 8066 """ 8067 self.conn.execute(sql_update) 8068 8069 # Delete dataframe 8070 del dataframe_snpeff_hgvs 8071 gc.collect() 8072 8073 else: 8074 8075 log.warning( 8076 "No snpEff 
annotation. Please Anotate with snpEff before use this calculation option" 8077 ) 8078 8079 # Remove added columns 8080 for added_column in added_columns: 8081 self.drop_column(column=added_column)
The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff
annotation field in a VCF file and adds them as a new column in the variants table.
Parameters

- snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` function is used to specify the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file; defaults to "snpeff_hgvs".
- snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` function represents the field in the VCF file that contains SnpEff annotations, from which the HGVS nomenclatures are extracted; defaults to "ANN".
8083 def calculation_snpeff_ann_explode( 8084 self, 8085 uniquify: bool = True, 8086 output_format: str = "fields", 8087 output_prefix: str = "snpeff_", 8088 snpeff_field: str = "ANN", 8089 ) -> None: 8090 """ 8091 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 8092 exploding the HGVS field and updating variant information accordingly. 8093 8094 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 8095 boolean flag that determines whether the output should be uniquified or not. When set to `True`, 8096 it indicates that the output should be unique, meaning that duplicate entries should be removed, 8097 defaults to True 8098 :type uniquify: bool (optional) 8099 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 8100 function specifies the format in which the output annotations will be generated. It has a 8101 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 8102 format, defaults to fields 8103 :type output_format: str (optional) 8104 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8105 method is used to specify the prefix that will be added to the output annotations generated 8106 during the calculation process. This prefix helps to differentiate the newly added annotations 8107 from existing ones in the output data. By default, the, defaults to ANN_ 8108 :type output_prefix: str (optional) 8109 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8110 function is used to specify the field in the VCF file that contains SnpEff annotations. 
This 8111 field will be processed to explode the HGVS annotations and update the variant information 8112 accordingly, defaults to ANN 8113 :type snpeff_field: str (optional) 8114 """ 8115 8116 # SnpEff annotation field 8117 snpeff_hgvs = "snpeff_ann_explode" 8118 8119 # Snpeff hgvs tags 8120 vcf_infos_tags = { 8121 snpeff_hgvs: "Explode snpEff annotations", 8122 } 8123 8124 # Prefix 8125 prefix = self.get_explode_infos_prefix() 8126 if prefix: 8127 prefix = "INFO/" 8128 8129 # snpEff fields 8130 speff_ann_infos = prefix + snpeff_field 8131 speff_hgvs_infos = prefix + snpeff_hgvs 8132 8133 # Variants table 8134 table_variants = self.get_table_variants() 8135 8136 # Header 8137 vcf_reader = self.get_header() 8138 8139 # Add columns 8140 added_columns = [] 8141 8142 # Explode HGVS field in column 8143 added_columns += self.explode_infos(fields=[snpeff_field]) 8144 log.debug(f"snpeff_field={snpeff_field}") 8145 log.debug(f"added_columns={added_columns}") 8146 8147 if snpeff_field in vcf_reader.infos: 8148 8149 # Extract ANN header 8150 ann_description = vcf_reader.infos[snpeff_field].desc 8151 pattern = r"'(.+?)'" 8152 match = re.search(pattern, ann_description) 8153 if match: 8154 ann_header_match = match.group(1).split(" | ") 8155 ann_header = [] 8156 ann_header_desc = {} 8157 for i in range(len(ann_header_match)): 8158 ann_header_info = "".join( 8159 char for char in ann_header_match[i] if char.isalnum() 8160 ) 8161 ann_header.append(ann_header_info) 8162 ann_header_desc[ann_header_info] = ann_header_match[i] 8163 if not ann_header_desc: 8164 raise ValueError("Invalid header description format") 8165 else: 8166 raise ValueError("Invalid header description format") 8167 8168 # Create variant id 8169 variant_id_column = self.get_variant_id_column() 8170 added_columns += [variant_id_column] 8171 8172 # Create dataframe 8173 dataframe_snpeff_hgvs = self.get_query_to_df( 8174 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8175 ) 8176 
8177 # Create snpEff columns 8178 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8179 speff_ann_infos 8180 ].apply( 8181 lambda x: explode_snpeff_ann( 8182 str(x), 8183 uniquify=uniquify, 8184 output_format=output_format, 8185 prefix=output_prefix, 8186 header=list(ann_header_desc.values()), 8187 ) 8188 ) 8189 8190 # Header 8191 ann_annotations_prefix = "" 8192 if output_format.upper() in ["JSON"]: 8193 ann_annotations_prefix = f"{output_prefix}=" 8194 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8195 output_prefix, 8196 ".", 8197 "String", 8198 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8199 + " - JSON format", 8200 "howard calculation", 8201 "0", 8202 self.code_type_map.get("String"), 8203 ) 8204 else: 8205 for ann_annotation in ann_header: 8206 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8207 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8208 ann_annotation_id, 8209 ".", 8210 "String", 8211 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8212 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8213 "howard calculation", 8214 "0", 8215 self.code_type_map.get("String"), 8216 ) 8217 8218 # Update 8219 sql_update = f""" 8220 UPDATE variants 8221 SET "INFO" = 8222 concat( 8223 CASE 8224 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8225 THEN '' 8226 ELSE concat("INFO", ';') 8227 END, 8228 CASE 8229 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8230 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8231 THEN concat( 8232 '{ann_annotations_prefix}', 8233 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8234 ) 8235 ELSE '' 8236 END 8237 ) 8238 FROM dataframe_snpeff_hgvs 8239 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8240 8241 """ 8242 self.conn.execute(sql_update) 8243 8244 # Delete dataframe 8245 del dataframe_snpeff_hgvs 8246 gc.collect() 8247 8248 else: 8249 8250 log.warning( 8251 "No snpEff annotation. 
Please Anotate with snpEff before use this calculation option" 8252 ) 8253 8254 # Remove added columns 8255 for added_column in added_columns: 8256 self.drop_column(column=added_column)
The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by
exploding the HGVS field and updating variant information accordingly.
Parameters

- uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a boolean flag that determines whether the output should be uniquified, i.e. whether duplicate entries should be removed; defaults to True.
- output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` function specifies the format in which the output annotations will be generated: "fields" or "JSON"; defaults to "fields".
- output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` method specifies the prefix added to the output annotations generated during the calculation, to differentiate the newly added annotations from existing ones; defaults to "snpeff_".
- snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` function specifies the field in the VCF file that contains SnpEff annotations. This field is processed to explode the annotations and update the variant information accordingly; defaults to "ANN".
8258 def calculation_extract_nomen(self) -> None: 8259 """ 8260 This function extracts the HGVS nomenclature from the calculation/identification of NOMEN. 8261 """ 8262 8263 # NOMEN field 8264 field_nomen_dict = "NOMEN_DICT" 8265 8266 # NOMEN structure 8267 nomen_dict = { 8268 "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)", 8269 "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)", 8270 "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)", 8271 "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant", 8272 "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)", 8273 "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)", 8274 "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)", 8275 "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)", 8276 "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)", 8277 "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)", 8278 } 8279 8280 # Param 8281 param = self.get_param() 8282 8283 # Prefix 8284 prefix = self.get_explode_infos_prefix() 8285 8286 # Header 8287 vcf_reader = self.get_header() 8288 8289 # Get HGVS field 8290 hgvs_field = ( 8291 param.get("calculation", {}) 8292 .get("calculations", {}) 8293 .get("NOMEN", {}) 8294 .get("options", {}) 8295 .get("hgvs_field", "hgvs") 8296 ) 8297 8298 # Get transcripts 8299 transcripts_file = ( 8300 param.get("calculation", {}) 8301 .get("calculations", {}) 8302 .get("NOMEN", {}) 8303 .get("options", {}) 8304 .get("transcripts", None) 8305 ) 8306 transcripts_file = full_path(transcripts_file) 8307 transcripts = [] 8308 if transcripts_file: 8309 if os.path.exists(transcripts_file): 8310 transcripts_dataframe = transcripts_file_to_df(transcripts_file) 8311 transcripts = transcripts_dataframe.iloc[:, 0].tolist() 8312 else: 
8313 log.error(f"Transcript file '{transcripts_file}' does NOT exist") 8314 raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist") 8315 8316 # Added columns 8317 added_columns = [] 8318 8319 # Explode HGVS field in column 8320 added_columns += self.explode_infos(fields=[hgvs_field]) 8321 8322 # extra infos 8323 extra_infos = self.get_extra_infos() 8324 extra_field = prefix + hgvs_field 8325 8326 if extra_field in extra_infos: 8327 8328 # Create dataframe 8329 dataframe_hgvs = self.get_query_to_df( 8330 f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """ 8331 ) 8332 8333 # Create main NOMEN column 8334 dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply( 8335 lambda x: find_nomen(str(x), transcripts=transcripts) 8336 ) 8337 8338 # Explode NOMEN Structure and create SQL set for update 8339 sql_nomen_fields = [] 8340 for nomen_field in nomen_dict: 8341 8342 # Explode each field into a column 8343 dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply( 8344 lambda x: dict(x).get(nomen_field, "") 8345 ) 8346 8347 # Create VCF header field 8348 vcf_reader.infos[nomen_field] = vcf.parser._Info( 8349 nomen_field, 8350 ".", 8351 "String", 8352 nomen_dict.get(nomen_field, "howard calculation NOMEN"), 8353 "howard calculation", 8354 "0", 8355 self.code_type_map.get("String"), 8356 ) 8357 sql_nomen_fields.append( 8358 f""" 8359 CASE 8360 WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('') 8361 THEN concat( 8362 ';{nomen_field}=', 8363 dataframe_hgvs."{nomen_field}" 8364 ) 8365 ELSE '' 8366 END 8367 """ 8368 ) 8369 8370 # SQL set for update 8371 sql_nomen_fields_set = ", ".join(sql_nomen_fields) 8372 8373 # Update 8374 sql_update = f""" 8375 UPDATE variants 8376 SET "INFO" = 8377 concat( 8378 CASE 8379 WHEN "INFO" IS NULL 8380 THEN '' 8381 ELSE "INFO" 8382 END, 8383 {sql_nomen_fields_set} 8384 ) 8385 FROM dataframe_hgvs 8386 WHERE variants."#CHROM" = 
dataframe_hgvs."#CHROM" 8387 AND variants."POS" = dataframe_hgvs."POS" 8388 AND variants."REF" = dataframe_hgvs."REF" 8389 AND variants."ALT" = dataframe_hgvs."ALT" 8390 """ 8391 self.conn.execute(sql_update) 8392 8393 # Delete dataframe 8394 del dataframe_hgvs 8395 gc.collect() 8396 8397 # Remove added columns 8398 for added_column in added_columns: 8399 self.drop_column(column=added_column)
This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
8401 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 8402 """ 8403 The function `calculation_find_by_pipeline` performs a calculation to find the number of 8404 pipeline/sample for a variant and updates the variant information in a VCF file. 8405 8406 :param tag: The `tag` parameter is a string that represents the annotation field for the 8407 "findbypipeline" information in the VCF file. It is used to create the annotation field in the 8408 VCF header and to update the corresponding field in the variants table, defaults to 8409 findbypipeline 8410 :type tag: str (optional) 8411 """ 8412 8413 # if FORMAT and samples 8414 if ( 8415 "FORMAT" in self.get_header_columns_as_list() 8416 and self.get_header_sample_list() 8417 ): 8418 8419 # findbypipeline annotation field 8420 findbypipeline_tag = tag 8421 8422 # VCF infos tags 8423 vcf_infos_tags = { 8424 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 8425 } 8426 8427 # Prefix 8428 prefix = self.get_explode_infos_prefix() 8429 8430 # Field 8431 findbypipeline_infos = prefix + findbypipeline_tag 8432 8433 # Variants table 8434 table_variants = self.get_table_variants() 8435 8436 # Header 8437 vcf_reader = self.get_header() 8438 8439 # Create variant id 8440 variant_id_column = self.get_variant_id_column() 8441 added_columns = [variant_id_column] 8442 8443 # variant_id, FORMAT and samples 8444 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8445 self.get_header_sample_list() 8446 ) 8447 8448 # Create dataframe 8449 dataframe_findbypipeline = self.get_query_to_df( 8450 f""" SELECT {samples_fields} FROM {table_variants} """ 8451 ) 8452 8453 # Create findbypipeline column 8454 dataframe_findbypipeline[findbypipeline_infos] = ( 8455 dataframe_findbypipeline.apply( 8456 lambda row: findbypipeline( 8457 row, samples=self.get_header_sample_list() 8458 ), 8459 axis=1, 8460 ) 8461 ) 8462 8463 # Add snpeff_hgvs to header 8464 
vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 8465 findbypipeline_tag, 8466 ".", 8467 "String", 8468 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 8469 "howard calculation", 8470 "0", 8471 self.code_type_map.get("String"), 8472 ) 8473 8474 # Update 8475 sql_update = f""" 8476 UPDATE variants 8477 SET "INFO" = 8478 concat( 8479 CASE 8480 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8481 THEN '' 8482 ELSE concat("INFO", ';') 8483 END, 8484 CASE 8485 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 8486 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 8487 THEN concat( 8488 '{findbypipeline_tag}=', 8489 dataframe_findbypipeline."{findbypipeline_infos}" 8490 ) 8491 ELSE '' 8492 END 8493 ) 8494 FROM dataframe_findbypipeline 8495 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 8496 """ 8497 self.conn.execute(sql_update) 8498 8499 # Remove added columns 8500 for added_column in added_columns: 8501 self.drop_column(column=added_column) 8502 8503 # Delete dataframe 8504 del dataframe_findbypipeline 8505 gc.collect()
The function calculation_find_by_pipeline performs a calculation to find the number of
pipeline/sample for a variant and updates the variant information in a VCF file.
Parameters

- tag: The `tag` parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table; defaults to "findbypipeline".
8507 def calculation_genotype_concordance(self) -> None: 8508 """ 8509 The function `calculation_genotype_concordance` calculates the genotype concordance for 8510 multi-caller VCF files and updates the variant information in the database. 8511 """ 8512 8513 # if FORMAT and samples 8514 if ( 8515 "FORMAT" in self.get_header_columns_as_list() 8516 and self.get_header_sample_list() 8517 ): 8518 8519 # genotypeconcordance annotation field 8520 genotypeconcordance_tag = "genotypeconcordance" 8521 8522 # VCF infos tags 8523 vcf_infos_tags = { 8524 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 8525 } 8526 8527 # Prefix 8528 prefix = self.get_explode_infos_prefix() 8529 8530 # Field 8531 genotypeconcordance_infos = prefix + genotypeconcordance_tag 8532 8533 # Variants table 8534 table_variants = self.get_table_variants() 8535 8536 # Header 8537 vcf_reader = self.get_header() 8538 8539 # Create variant id 8540 variant_id_column = self.get_variant_id_column() 8541 added_columns = [variant_id_column] 8542 8543 # variant_id, FORMAT and samples 8544 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8545 self.get_header_sample_list() 8546 ) 8547 8548 # Create dataframe 8549 dataframe_genotypeconcordance = self.get_query_to_df( 8550 f""" SELECT {samples_fields} FROM {table_variants} """ 8551 ) 8552 8553 # Create genotypeconcordance column 8554 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 8555 dataframe_genotypeconcordance.apply( 8556 lambda row: genotypeconcordance( 8557 row, samples=self.get_header_sample_list() 8558 ), 8559 axis=1, 8560 ) 8561 ) 8562 8563 # Add genotypeconcordance to header 8564 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 8565 genotypeconcordance_tag, 8566 ".", 8567 "String", 8568 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 8569 "howard calculation", 8570 "0", 8571 self.code_type_map.get("String"), 8572 ) 8573 8574 # Update 8575 sql_update = f""" 8576 
UPDATE variants 8577 SET "INFO" = 8578 concat( 8579 CASE 8580 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8581 THEN '' 8582 ELSE concat("INFO", ';') 8583 END, 8584 CASE 8585 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 8586 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 8587 THEN concat( 8588 '{genotypeconcordance_tag}=', 8589 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 8590 ) 8591 ELSE '' 8592 END 8593 ) 8594 FROM dataframe_genotypeconcordance 8595 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 8596 """ 8597 self.conn.execute(sql_update) 8598 8599 # Remove added columns 8600 for added_column in added_columns: 8601 self.drop_column(column=added_column) 8602 8603 # Delete dataframe 8604 del dataframe_genotypeconcordance 8605 gc.collect()
The function calculation_genotype_concordance calculates the genotype concordance for
multi-caller VCF files and updates the variant information in the database.
8607 def calculation_barcode(self, tag: str = "barcode") -> None: 8608 """ 8609 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 8610 updates the INFO field in the file with the calculated barcode values. 8611 8612 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 8613 name that will be used for the barcode calculation in the VCF file. If no tag name is provided, 8614 the default tag name is set to "barcode", defaults to barcode 8615 :type tag: str (optional) 8616 """ 8617 8618 # if FORMAT and samples 8619 if ( 8620 "FORMAT" in self.get_header_columns_as_list() 8621 and self.get_header_sample_list() 8622 ): 8623 8624 # barcode annotation field 8625 if not tag: 8626 tag = "barcode" 8627 8628 # VCF infos tags 8629 vcf_infos_tags = { 8630 tag: "barcode calculation (VaRank)", 8631 } 8632 8633 # Prefix 8634 prefix = self.get_explode_infos_prefix() 8635 8636 # Field 8637 barcode_infos = prefix + tag 8638 8639 # Variants table 8640 table_variants = self.get_table_variants() 8641 8642 # Header 8643 vcf_reader = self.get_header() 8644 8645 # Create variant id 8646 variant_id_column = self.get_variant_id_column() 8647 added_columns = [variant_id_column] 8648 8649 # variant_id, FORMAT and samples 8650 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8651 self.get_header_sample_list() 8652 ) 8653 8654 # Create dataframe 8655 dataframe_barcode = self.get_query_to_df( 8656 f""" SELECT {samples_fields} FROM {table_variants} """ 8657 ) 8658 8659 # Create barcode column 8660 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8661 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 8662 ) 8663 8664 # Add barcode to header 8665 vcf_reader.infos[tag] = vcf.parser._Info( 8666 tag, 8667 ".", 8668 "String", 8669 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 8670 "howard calculation", 8671 "0", 8672 self.code_type_map.get("String"), 8673 ) 8674 8675 # 
Update 8676 sql_update = f""" 8677 UPDATE {table_variants} 8678 SET "INFO" = 8679 concat( 8680 CASE 8681 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8682 THEN '' 8683 ELSE concat("INFO", ';') 8684 END, 8685 CASE 8686 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 8687 AND dataframe_barcode."{barcode_infos}" NOT NULL 8688 THEN concat( 8689 '{tag}=', 8690 dataframe_barcode."{barcode_infos}" 8691 ) 8692 ELSE '' 8693 END 8694 ) 8695 FROM dataframe_barcode 8696 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 8697 """ 8698 self.conn.execute(sql_update) 8699 8700 # Remove added columns 8701 for added_column in added_columns: 8702 self.drop_column(column=added_column) 8703 8704 # Delete dataframe 8705 del dataframe_barcode 8706 gc.collect()
The calculation_barcode function calculates barcode values for variants in a VCF file and
updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode"; defaults to "barcode".
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Calculate a family barcode from the genotypes of the pedigree samples
        and append it to the FORMAT column and to each sample's genotype.

        Two FORMAT fields are added to the header: '<tag>' (barcode value) and
        '<tag>S' (comma-separated list of the pedigree samples). The pedigree
        is read from param calculation/calculations/BARCODEFAMILY/family_pedigree
        and may be a JSON file path, a JSON string, a comma-separated sample
        list, or a dict; when absent, all samples are used.

        :param tag: FORMAT tag used for the barcode family annotation; falsy
            values fall back to "BCF", defaults to "BCF"
        :type tag: str (optional)
        """

        # Only applicable when genotypes (FORMAT + samples) are present
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against explicit falsy tag)
            if not tag:
                tag = "BCF"

            # VCF infos tags (header descriptions)
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param (pedigree definition)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file (note: 'ped' name is rebound from path to
                # file handle to parsed dict on purpose)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON first, else comma-separated list
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict: used as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct the flat sample list from the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree: every sample maps to itself
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree is non-empty
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the dataframe column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and the pedigree samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe of genotypes
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode row by row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family FORMAT fields to header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column: pedigree samples get the barcode
            # value + sample list, FORMAT gets the tag names, other samples '.'
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Characters stripped from FORMAT to synthesize the missing
                # genotype pattern ('./.' padded with ':.' per FORMAT field)
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE joined on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe and reclaim memory
            del dataframe_barcode
            gc.collect()
The calculation_barcode_family function calculates barcode values for variants in a VCF file
and updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the `tag` parameter, the default value used is "BCF".
    def calculation_trio(self) -> None:
        """
        Perform a trio (father/mother/child) calculation on the genotypes and
        append the result to the INFO field as 'trio=<value>'.

        The trio pedigree is read from param calculation/calculations/TRIO/
        trio_pedigree and may be a JSON file path, a JSON string, a
        comma-separated 'father,mother,child' string, or a dict; when absent,
        the first three samples are used. Raises ValueError when the pedigree
        is malformed or fewer than three samples are available.
        """

        # Only applicable when genotypes (FORMAT + samples) are present
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field
            trio_tag = "trio"

            # VCF infos tags (header descriptions)
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Trio param (pedigree definition)
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio
            if trio_ped:

                # Trio pedigree is a file (note: 'trio_ped' is rebound from
                # path to file handle to parsed dict on purpose)
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = json.load(trio_ped)

                # Trio pedigree is a string: JSON first, else 'f,m,c' list
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is a dict: used as-is
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio list in fixed father/mother/child order
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                # No pedigree: fall back to the first three samples
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree has exactly the three expected members
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Name of the dataframe column holding the computed trio value
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and all samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe of genotypes
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the trio value row by row from the trio genotypes
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add trio to header
            # NOTE(review): the fallback description "snpEff hgvs annotations"
            # looks copy-pasted from another calculation; it is never used
            # since 'trio' is a key of vcf_infos_tags — confirm and clean up.
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO: append 'trio=<value>' (with ';' separator when
            # INFO already has content), skipping empty/'.'/NULL values
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                             AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                    '{trio_tag}=',
                                    dataframe_trio."{trio_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe and reclaim memory
            del dataframe_trio
            gc.collect()
The calculation_trio function performs trio calculations on a VCF file by adding trio
information to the INFO field of each variant.
9077 def calculation_vaf_normalization(self) -> None: 9078 """ 9079 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9080 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9081 :return: The function does not return anything. 9082 """ 9083 9084 # if FORMAT and samples 9085 if ( 9086 "FORMAT" in self.get_header_columns_as_list() 9087 and self.get_header_sample_list() 9088 ): 9089 9090 # vaf_normalization annotation field 9091 vaf_normalization_tag = "VAF" 9092 9093 # VCF infos tags 9094 vcf_infos_tags = { 9095 "VAF": "VAF Variant Frequency", 9096 } 9097 9098 # Prefix 9099 prefix = self.get_explode_infos_prefix() 9100 9101 # Variants table 9102 table_variants = self.get_table_variants() 9103 9104 # Header 9105 vcf_reader = self.get_header() 9106 9107 # Do not calculate if VAF already exists 9108 if "VAF" in vcf_reader.formats: 9109 log.debug("VAF already on genotypes") 9110 return 9111 9112 # Create variant id 9113 variant_id_column = self.get_variant_id_column() 9114 added_columns = [variant_id_column] 9115 9116 # variant_id, FORMAT and samples 9117 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9118 f""" "{sample}" """ for sample in self.get_header_sample_list() 9119 ) 9120 9121 # Create dataframe 9122 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 9123 log.debug(f"query={query}") 9124 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9125 9126 vaf_normalization_set = [] 9127 9128 # for each sample vaf_normalization 9129 for sample in self.get_header_sample_list(): 9130 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9131 lambda row: vaf_normalization(row, sample=sample), axis=1 9132 ) 9133 vaf_normalization_set.append( 9134 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9135 ) 9136 9137 # Add VAF to FORMAT 9138 dataframe_vaf_normalization["FORMAT"] = 
dataframe_vaf_normalization[ 9139 "FORMAT" 9140 ].apply(lambda x: str(x) + ":VAF") 9141 vaf_normalization_set.append( 9142 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9143 ) 9144 9145 # Add vaf_normalization to header 9146 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9147 id=vaf_normalization_tag, 9148 num="1", 9149 type="Float", 9150 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9151 type_code=self.code_type_map.get("Float"), 9152 ) 9153 9154 # Create fields to add in INFO 9155 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9156 9157 # Update 9158 sql_update = f""" 9159 UPDATE {table_variants} 9160 SET {sql_vaf_normalization_set} 9161 FROM dataframe_vaf_normalization 9162 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9163 9164 """ 9165 self.conn.execute(sql_update) 9166 9167 # Remove added columns 9168 for added_column in added_columns: 9169 self.drop_column(column=added_column) 9170 9171 # Delete dataframe 9172 del dataframe_vaf_normalization 9173 gc.collect()
The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency)
normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
Returns
The function does not return anything.
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Calculate per-variant genotype statistics for a FORMAT value (e.g. VAF)
        across all samples and append them to the INFO field.

        Seven INFO tags are produced: '<info>_stats_nb', '_list', '_min',
        '_max', '_mean', '_mediane' and '_stdev', each registered in the
        header. Skipped when the file has no FORMAT column or no samples.

        :param info: Name of the genotype value to aggregate; it is used to
            build the statistics tag names, defaults to "VAF"
        :type info: str (optional)
        """

        # Only applicable when genotypes (FORMAT + samples) are present
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags (one header entry per statistic)
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and all samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe of genotypes
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the statistics row by row (one dict per variant)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic tag
            sql_vaf_stats_fields = []

            # For each statistic: extract its value, register the header
            # entry, and build the INFO concat fragment
            for stat in vcf_infos_tags:

                # Extract this statistic from the stats dict into its own column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add the statistic tag to the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First fragment gets no separator; subsequent ones are
                # prefixed with ';'
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Fragment: '<sep><stat>=<value>' or '' when the value is NULL
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                                '{sep}{stat}=',
                                dataframe_vaf_stats."{stat}"
                            )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update (fragments become extra concat arguments)
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update INFO with all statistics, joined on the variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe and reclaim memory
            del dataframe_vaf_stats
            gc.collect()
The calculation_genotype_stats function calculates genotype statistics for a given information
field in a VCF file and updates the INFO column of the variants table with the calculated
statistics.
Parameters
- info: The `info` parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, and the median; defaults to "VAF".
9313 def calculation_transcripts_annotation( 9314 self, info_json: str = None, info_format: str = None 9315 ) -> None: 9316 """ 9317 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 9318 field to it if transcripts are available. 9319 9320 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 9321 is a string parameter that represents the information field to be used in the transcripts JSON. 9322 It is used to specify the JSON format for the transcripts information. If no value is provided 9323 when calling the method, it defaults to " 9324 :type info_json: str 9325 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 9326 method is a string parameter that specifies the format of the information field to be used in 9327 the transcripts JSON. It is used to define the format of the information field 9328 :type info_format: str 9329 """ 9330 9331 # Create transcripts table 9332 transcripts_table = self.create_transcript_view() 9333 9334 # Add info field 9335 if transcripts_table: 9336 self.transcript_view_to_variants( 9337 transcripts_table=transcripts_table, 9338 transcripts_info_field_json=info_json, 9339 transcripts_info_field_format=info_format, 9340 ) 9341 else: 9342 log.info("No Transcripts to process. Check param.json file configuration")
The calculation_transcripts_annotation function creates a transcripts table and adds an info
field to it if transcripts are available.
Parameters
- info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method is a string that represents the information field to be used for the transcripts JSON; it specifies the JSON format for the transcripts information.
- info_format: The `info_format` parameter in the `calculation_transcripts_annotation` method is a string that specifies the format of the information field to be used in the transcripts JSON.
9344 def calculation_transcripts_prioritization(self) -> None: 9345 """ 9346 The function `calculation_transcripts_prioritization` creates a transcripts table and 9347 prioritizes transcripts based on certain criteria. 9348 """ 9349 9350 # Create transcripts table 9351 transcripts_table = self.create_transcript_view() 9352 9353 # Add info field 9354 if transcripts_table: 9355 self.transcripts_prioritization(transcripts_table=transcripts_table) 9356 else: 9357 log.info("No Transcripts to process. Check param.json file configuration")
The function calculation_transcripts_prioritization creates a transcripts table and
prioritizes transcripts based on certain criteria.
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        Prioritize transcripts and export the best transcript per variant to
        the variants table INFO field.

        Prioritization is forced to produce only '<pzprefix>Score' and
        '<pzprefix>Flag' fields; the winning transcript per variant (lowest
        flag, highest score, then transcript name) is written to INFO as
        '<pzprefix>Transcript=...;<pzprefix>Score=...;<pzprefix>Flag=...'.

        :param transcripts_table: Name of the table containing transcripts
            data; when None, the table is built via `create_transcript_view`
        :type transcripts_table: str
        :param param: Configuration dict for prioritization (pzprefix,
            profiles, ...); falls back to `self.get_param()` when empty.
            NOTE(review): the mutable default `{}` is never mutated because a
            falsy param is replaced immediately, but `pz_param["pzfields"]`
            does mutate the caller's param dict — confirm intended.
        :type param: dict
        :return: True when prioritization completed, False when no profile is
            defined or prioritization was not processed
        """

        log.debug("Start transcripts prioritization...")

        # Param (fall back to instance-level parameters)
        if not param:
            param = self.get_param()

        # Variants table
        table_variants = self.get_table_variants()
        log.debug(f"transcripts_table={transcripts_table}")
        # Transcripts table: build it when not provided
        if transcripts_table is None:
            log.debug(f"transcripts_table={transcripts_table}")
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
            log.debug(f"transcripts_table={transcripts_table}")
        if transcripts_table is None:
            # NOTE(review): typo 'availalble' in the error message
            msg_err = "No Transcripts table availalble"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
            """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO column if it does not exist yet
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
                """
            self.execute_query(query_add_info)

        # Prioritization param; force only PZ Score and Flag fields
        # (pz_param aliases the nested dict inside param, so the assignment
        # below also updates param)
        pz_param = param.get("transcripts", {}).get("prioritization", {})
        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # Run prioritization on the transcripts table
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # Explode PZ fields into columns for the ranking query below
        self.explode_infos(
            table=transcripts_table,
            fields=param.get("transcripts", {})
            .get("prioritization", {})
            .get("pzfields", []),
        )

        # Export the top-ranked transcript per variant to the variants table:
        # rank by flag ASC, score DESC, transcript ASC and keep rn = 1
        query_update = f"""
            WITH RankedTranscripts AS (
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
                    ) AS rn
                FROM
                    {transcripts_table}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"

            """
        self.execute_query(query=query_update)

        # Add PZ Transcript in header
        # NOTE(review): 'code_type_map' is used without 'self.' here, unlike
        # the sibling calculation methods — presumably a module-level mapping
        # from a star import; verify.
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Return
        return True
The transcripts_prioritization function prioritizes transcripts based on certain parameters
and updates the variants table with the prioritized information.
Parameters
- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process.
- param: The `param` parameter in the `transcripts_prioritization` method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields and default profiles.
Returns
The function `transcripts_prioritization` returns a boolean value: `True` if the transcripts prioritization process completes successfully, and `False` if there are any issues or if no profile is defined for transcripts prioritization.
9500 def create_transcript_view_from_columns_map( 9501 self, 9502 transcripts_table: str = "transcripts", 9503 columns_maps: dict = {}, 9504 added_columns: list = [], 9505 temporary_tables: list = None, 9506 annotation_fields: list = None, 9507 ) -> tuple[list, list, list]: 9508 """ 9509 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 9510 specified columns mapping for transcripts data. 9511 9512 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9513 the table where the transcripts data is stored or will be stored in the database. This table 9514 typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, 9515 predictions, etc. It defaults to "transcripts, defaults to transcripts 9516 :type transcripts_table: str (optional) 9517 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about 9518 how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list 9519 represents a mapping configuration for a specific set of columns. It typically includes details such 9520 as the main transcript column and additional information columns 9521 :type columns_maps: dict 9522 :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` 9523 function is a list that stores the additional columns that will be added to the view being created 9524 based on the columns map provided. These columns are generated by exploding the transcript 9525 information columns along with the main transcript column 9526 :type added_columns: list 9527 :param temporary_tables: The `temporary_tables` parameter in the 9528 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 9529 tables created during the process of creating a transcript view from a columns map. 
These temporary 9530 tables are used to store intermediate results or transformations before the final view is generated 9531 :type temporary_tables: list 9532 :param annotation_fields: The `annotation_fields` parameter in the 9533 `create_transcript_view_from_columns_map` function is a list that stores the fields that are used 9534 for annotation in the query view creation process. These fields are extracted from the 9535 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 9536 :type annotation_fields: list 9537 :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three 9538 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 9539 """ 9540 9541 log.debug("Start transcrpts view creation from columns map...") 9542 9543 # "from_columns_map": [ 9544 # { 9545 # "transcripts_column": "Ensembl_transcriptid", 9546 # "transcripts_infos_columns": [ 9547 # "genename", 9548 # "Ensembl_geneid", 9549 # "LIST_S2_score", 9550 # "LIST_S2_pred", 9551 # ], 9552 # }, 9553 # { 9554 # "transcripts_column": "Ensembl_transcriptid", 9555 # "transcripts_infos_columns": [ 9556 # "genename", 9557 # "VARITY_R_score", 9558 # "Aloft_pred", 9559 # ], 9560 # }, 9561 # ], 9562 9563 # Init 9564 if temporary_tables is None: 9565 temporary_tables = [] 9566 if annotation_fields is None: 9567 annotation_fields = [] 9568 9569 # Variants table 9570 table_variants = self.get_table_variants() 9571 9572 for columns_map in columns_maps: 9573 9574 # Transcript column 9575 transcripts_column = columns_map.get("transcripts_column", None) 9576 9577 # Transcripts infos columns 9578 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 9579 9580 if transcripts_column is not None: 9581 9582 # Explode 9583 added_columns += self.explode_infos( 9584 fields=[transcripts_column] + transcripts_infos_columns 9585 ) 9586 9587 # View clauses 9588 clause_select = [] 9589 for field in [transcripts_column] + 
transcripts_infos_columns: 9590 clause_select.append( 9591 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 9592 ) 9593 if field not in [transcripts_column]: 9594 annotation_fields.append(field) 9595 9596 # Querey View 9597 query = f""" 9598 SELECT 9599 "#CHROM", POS, REF, ALT, 9600 "{transcripts_column}" AS 'transcript', 9601 {", ".join(clause_select)} 9602 FROM ( 9603 SELECT 9604 "#CHROM", POS, REF, ALT, 9605 {", ".join(clause_select)} 9606 FROM {table_variants} 9607 ) 9608 WHERE "{transcripts_column}" IS NOT NULL 9609 """ 9610 9611 # Create temporary table 9612 temporary_table = transcripts_table + "".join( 9613 random.choices(string.ascii_uppercase + string.digits, k=10) 9614 ) 9615 9616 # Temporary_tables 9617 temporary_tables.append(temporary_table) 9618 query_view = f""" 9619 CREATE TEMPORARY TABLE {temporary_table} 9620 AS ({query}) 9621 """ 9622 self.execute_query(query=query_view) 9623 9624 return added_columns, temporary_tables, annotation_fields
The create_transcript_view_from_columns_map function generates a temporary table view based on
specified columns mapping for transcripts data.
Parameters
- transcripts_table: The
`transcripts_table` parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts". - columns_maps: The
columns_mapsparameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in thecolumns_mapslist represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns - added_columns: The
added_columnsparameter in thecreate_transcript_view_from_columns_mapfunction is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column - temporary_tables: The
temporary_tablesparameter in thecreate_transcript_view_from_columns_mapfunction is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated - annotation_fields: The
annotation_fieldsparameter in thecreate_transcript_view_from_columns_mapfunction is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from thetranscripts_columnandtranscripts_infos_columnsspecified in the `columns
Returns
The function
`create_transcript_view_from_columns_map` returns a tuple containing three lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
    def create_transcript_view_from_column_format(
        self,
        transcripts_table: str = "transcripts",
        column_formats: dict = {},
        temporary_tables: list = None,
        annotation_fields: list = None,
    ) -> tuple[list, list]:
        """
        Create one temporary transcript view per entry of `column_formats` and
        collect the resulting view names and annotation field names.

        For each mapping, the structured annotation column (e.g. snpEff "ANN")
        is exploded into a uniquely-named temporary table through
        `annotation_format_to_table`, keyed on the transcript identifier
        sub-field (e.g. "Feature_ID"). The columns of each created view —
        except '#CHROM', 'POS', 'REF', 'ALT' — are gathered as annotation
        fields.

        :param transcripts_table: Base name used to build unique temporary view
            names (a random 10-character suffix is appended), defaults to
            "transcripts"
        :type transcripts_table: str (optional)
        :param column_formats: Iterated as a list of mappings, each with keys
            "transcripts_column" (default "ANN") and "transcripts_infos_column"
            (default "Feature_ID"). NOTE(review): annotated/defaulted as a dict
            but used as a list of dicts — confirm and align the annotation
        :type column_formats: dict
        :param temporary_tables: Accumulator of temporary view names; mutated
            in place AND returned (callers must not also re-append the return
            value). A fresh list is created when None
        :type temporary_tables: list
        :param annotation_fields: Accumulator of annotation field names;
            mutated in place AND returned. A fresh list is created when None
        :type annotation_fields: list
        :return: The tuple (temporary_tables, annotation_fields)
        """

        # NOTE(review): "transcrpts" typo in the log message (runtime string,
        # left untouched here)
        log.debug("Start transcrpts view creation from column format...")

        # Example of expected param structure:
        # "from_column_format": [
        #     {
        #         "transcripts_column": "ANN",
        #         "transcripts_infos_column": "Feature_ID",
        #     }
        # ],

        # Init accumulators when not provided by the caller
        if temporary_tables is None:
            temporary_tables = []
        if annotation_fields is None:
            annotation_fields = []

        for column_format in column_formats:

            # Annotation field and transcript annotation field
            annotation_field = column_format.get("transcripts_column", "ANN")
            transcript_annotation = column_format.get(
                "transcripts_infos_column", "Feature_ID"
            )

            # Unique temporary view name (random suffix avoids collisions)
            temporary_view_name = transcripts_table + "".join(
                random.choices(string.ascii_uppercase + string.digits, k=10)
            )

            # Explode the annotation column into the temporary view;
            # returns None when the field is absent from the VCF header
            temporary_view_name = self.annotation_format_to_table(
                uniquify=True,
                annotation_field=annotation_field,
                view_name=temporary_view_name,
                annotation_id=transcript_annotation,
            )

            # Collect the view's annotation columns (skip variant key columns)
            if temporary_view_name:
                query_annotation_fields = f"""
                    SELECT *
                    FROM (
                        DESCRIBE SELECT *
                        FROM {temporary_view_name}
                    )
                    WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
                """
                df_annotation_fields = self.get_query_to_df(
                    query=query_annotation_fields
                )

                # Add temporary view and annotation fields (deduplicated per view)
                temporary_tables.append(temporary_view_name)
                annotation_fields += list(set(df_annotation_fields["column_name"]))

        return temporary_tables, annotation_fields
The create_transcript_view_from_column_format function generates a transcript view based on
specified column formats, adds additional columns and annotation fields, and returns the list of
temporary tables and annotation fields.
Parameters
- transcripts_table: The
transcripts_tableparameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts - column_formats: The
column_formatsparameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. For example, in the provided code snippet: - temporary_tables: The
temporary_tablesparameter in thecreate_transcript_view_from_column_formatfunction is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view. It - annotation_fields: The
annotation_fieldsparameter in thecreate_transcript_view_from_column_formatfunction is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names excluding specific columns like `#CH
Returns
The
`create_transcript_view_from_column_format` function returns two lists: `temporary_tables` and `annotation_fields`.
9719 def create_transcript_view( 9720 self, 9721 transcripts_table: str = None, 9722 transcripts_table_drop: bool = True, 9723 param: dict = {}, 9724 ) -> str: 9725 """ 9726 The `create_transcript_view` function generates a transcript view by processing data from a 9727 specified table based on provided parameters and structural information. 9728 9729 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 9730 is used to specify the name of the table that will store the final transcript view data. If a table 9731 name is not provided, the function will create a new table to store the transcript view data, and by 9732 default,, defaults to transcripts 9733 :type transcripts_table: str (optional) 9734 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 9735 `create_transcript_view` function is a boolean parameter that determines whether to drop the 9736 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 9737 the function will drop the existing transcripts table if it exists, defaults to True 9738 :type transcripts_table_drop: bool (optional) 9739 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 9740 contains information needed to create a transcript view. It includes details such as the structure 9741 of the transcripts, columns mapping, column formats, and other necessary information for generating 9742 the view. This parameter allows for flexibility and customization 9743 :type param: dict 9744 :return: The `create_transcript_view` function returns the name of the transcripts table that was 9745 created or modified during the execution of the function. 
9746 """ 9747 9748 log.debug("Start transcripts view creation...") 9749 9750 # Default 9751 transcripts_table_default = "transcripts" 9752 9753 # Param 9754 if not param: 9755 param = self.get_param() 9756 9757 # Struct 9758 struct = param.get("transcripts", {}).get("struct", None) 9759 9760 if struct: 9761 9762 # Transcripts table 9763 if transcripts_table is None: 9764 transcripts_table = param.get("transcripts", {}).get( 9765 "table", transcripts_table_default 9766 ) 9767 9768 # added_columns 9769 added_columns = [] 9770 9771 # Temporary tables 9772 temporary_tables = [] 9773 9774 # Annotation fields 9775 annotation_fields = [] 9776 9777 # from columns map 9778 columns_maps = struct.get("from_columns_map", []) 9779 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 9780 self.create_transcript_view_from_columns_map( 9781 transcripts_table=transcripts_table, 9782 columns_maps=columns_maps, 9783 added_columns=added_columns, 9784 temporary_tables=temporary_tables, 9785 annotation_fields=annotation_fields, 9786 ) 9787 ) 9788 added_columns += added_columns_tmp 9789 temporary_tables += temporary_tables_tmp 9790 annotation_fields += annotation_fields_tmp 9791 9792 # from column format 9793 column_formats = struct.get("from_column_format", []) 9794 temporary_tables_tmp, annotation_fields_tmp = ( 9795 self.create_transcript_view_from_column_format( 9796 transcripts_table=transcripts_table, 9797 column_formats=column_formats, 9798 temporary_tables=temporary_tables, 9799 annotation_fields=annotation_fields, 9800 ) 9801 ) 9802 temporary_tables += temporary_tables_tmp 9803 annotation_fields += annotation_fields_tmp 9804 9805 # Merge temporary tables query 9806 query_merge = "" 9807 for temporary_table in temporary_tables: 9808 9809 # First temporary table 9810 if not query_merge: 9811 query_merge = f""" 9812 SELECT * FROM {temporary_table} 9813 """ 9814 # other temporary table (using UNION) 9815 else: 9816 query_merge += f""" 9817 UNION BY NAME SELECT * FROM 
{temporary_table} 9818 """ 9819 9820 # Merge on transcript 9821 query_merge_on_transcripts_annotation_fields = [] 9822 # Aggregate all annotations fields 9823 for annotation_field in set(annotation_fields): 9824 query_merge_on_transcripts_annotation_fields.append( 9825 f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """ 9826 ) 9827 # Query for transcripts view 9828 query_merge_on_transcripts = f""" 9829 SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)} 9830 FROM ({query_merge}) 9831 GROUP BY "#CHROM", POS, REF, ALT, transcript 9832 """ 9833 9834 # Drop transcript view is necessary 9835 if transcripts_table_drop: 9836 query_drop = f""" 9837 DROP TABLE IF EXISTS {transcripts_table}; 9838 """ 9839 self.execute_query(query=query_drop) 9840 9841 # Merge and create transcript view 9842 query_create_view = f""" 9843 CREATE TABLE IF NOT EXISTS {transcripts_table} 9844 AS {query_merge_on_transcripts} 9845 """ 9846 self.execute_query(query=query_create_view) 9847 9848 # Remove added columns 9849 for added_column in added_columns: 9850 self.drop_column(column=added_column) 9851 9852 else: 9853 9854 transcripts_table = None 9855 9856 return transcripts_table
The create_transcript_view function generates a transcript view by processing data from a
specified table based on provided parameters and structural information.
Parameters
- transcripts_table: The
transcripts_tableparameter in thecreate_transcript_viewfunction is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data, and by default,, defaults to transcripts - transcripts_table_drop: The
transcripts_table_dropparameter in thecreate_transcript_viewfunction is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. Iftranscripts_table_dropis set toTrue, the function will drop the existing transcripts table if it exists, defaults to True - param: The
paramparameter in thecreate_transcript_viewfunction is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns
The
`create_transcript_view` function returns the name of the transcripts table that was created or modified during the execution of the function.
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        Explode a structured VCF annotation field (e.g. snpEff "ANN") into a
        temporary table with one typed column per annotation sub-field.

        The annotation field is exploded to a column, converted row-by-row to
        JSON with `explode_annotation_format`, then each JSON key is extracted
        as a column whose SQL type is inferred with `detect_column_type`. The
        resulting temporary table carries "#CHROM", POS, REF, ALT, the typed
        annotation columns, and a 'transcript' column aliased from
        `annotation_id`.

        :param uniquify: Forwarded to `explode_annotation_format` to control
            deduplication of exploded values, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: INFO field holding the structured annotation;
            must be declared in the VCF header, defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: Sub-field used as the transcript identifier
            (non-alphanumeric characters are stripped), defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table to create, defaults to
            transcripts
        :type view_name: str (optional)
        :return: The name of the created table, or None when `annotation_field`
            is not declared in the VCF header
        """

        # Name of the intermediate exploded-annotation column
        annotation_format = "annotation_explode"

        # Sanitize transcript annotation id (keep alphanumerics only)
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix for exploded INFO columns.
        # NOTE(review): any truthy prefix is forced to "INFO/" regardless of
        # its value, and a falsy prefix is used as-is in concatenations below
        # (would raise TypeError if None) — confirm get_explode_infos_prefix()
        # always returns a string and whether this override is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the source annotation and its exploded form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added to the variants table, dropped before returning
        added_columns = []

        # Explode annotation field into a column of the variants table
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the header
            # description, expected quoted and ' | '-separated
            # (e.g. "Functional annotations: 'Allele | Annotation | ...'")
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized name -> original name mapping
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Pull variant keys plus the annotation column into a DataFrame
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation value to JSON
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find the JSON keys present in the first annotation entry.
            # NOTE(review): the query references column "annotation_explode"
            # while the DataFrame column may be prefixed
            # (annotation_format_infos) — confirm this matches when a prefix
            # is in effect
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT clause per JSON key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # Sanitized key used as output column name
                key_clean = "".join(char for char in key if char.isalnum())

                # Extract the key's values to sample its type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the non-empty sample
                column_type = detect_column_type(df_json_type[key_clean])

                # Cast empty strings to NULL and the rest to the detected type
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, aliasing the transcript id column
            query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));"""
            self.execute_query(query=query_view)

        else:

            # Annotation field not in header: signal failure with None
            view_name = None

        # Remove columns added to the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
The function annotation_format_to_table converts annotation data from a VCF file into a structured
table format.
Parameters
- uniquify: The
uniquifyparameter is a boolean flag that determines whether to ensure unique values in the output or not. If set toTrue, the function will make sure that the output values are unique, defaults to True - annotation_field: The
annotation_fieldparameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function, defaults to ANN - annotation_id: The
annotation_idparameter in theannotation_format_to_tablemethod is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry in the, defaults to Feature_ID - view_name: The
view_nameparameter in theannotation_format_to_tablemethod is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis, defaults to transcripts
Returns
The function
`annotation_format_to_table` is returning the name of the view created, which is stored in the variable `view_name`.
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        Write transcript annotations from the transcripts table back onto the
        variants table, as a JSON column/INFO field and/or a pipe-delimited
        ("structured") column/INFO field.

        Each requested output also registers a matching INFO declaration on
        the in-memory VCF header.

        :param transcripts_table: Name of the transcripts table; when None,
            taken from param["transcripts"]["table"], default "transcripts"
        :type transcripts_table: str
        :param transcripts_column_id: Column of `transcripts_table` holding the
            transcript identifier; when None, taken from
            param["transcripts"]["column_id"], default "transcript"
        :type transcripts_column_id: str
        :param transcripts_info_json: Name of a JSON column to add to the
            variants table with per-transcript annotations
        :type transcripts_info_json: str
        :param transcripts_info_field_json: Name of an INFO field to append to
            the INFO column with the JSON annotations
        :type transcripts_info_field_json: str
        :param transcripts_info_format: Name of a VARCHAR column to add to the
            variants table with 'transcript|field|...' formatted annotations
        :type transcripts_info_format: str
        :param transcripts_info_field_format: Name of an INFO field to append
            to the INFO column with the formatted annotations
        :type transcripts_info_field_format: str
        :param param: Parameters dict; when empty, `self.get_param()` is used
        :type param: dict
        :return: True on completion; False when none of the four output
            options is configured
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Default
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # if transcripts_info_field_json is not None and transcripts_info_json is None:
        #     transcripts_info_json = transcripts_info_field_json

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # if (
        #     transcripts_info_field_format is not None
        #     and transcripts_info_format is None
        # ):
        #     transcripts_info_format = transcripts_info_field_format

        # Variants table
        table_variants = self.get_table_variants()

        # Nothing to do when no output option is configured
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Transcripts annotation columns (everything except the variant key
        # columns and the transcript id column)
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # SELECT / JSON-struct / pipe-format clauses, one per annotation column
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            clause_select.append(
                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
            )
            clause_to_json.append(f""" '{field}': "{field}" """)
            clause_to_format.append(f""" "{field}" """)

        # SET clauses for the two UPDATE statements
        update_set_json = []
        update_set_format = []

        # VCF header
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header (source/version are "unknwon" (sic) in the original)
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json is not None:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Append ';<field>=<json>' to INFO when the JSON value is set.
            # NOTE(review): this clause interpolates transcripts_info_json —
            # when only transcripts_info_field_json is configured this renders
            # 't.None'; the commented-out fallback above presumably covered
            # that case — confirm callers always set both
            update_set_json.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_json}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Build one JSON object per variant:
            # {"<transcript>": {field: value, ...}, ...} and apply the SET
            # clauses joined on the variant key
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_json)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            concat(
                                '{{',
                                string_agg(
                                    '"' || "{transcripts_column_id}" || '":' ||
                                    to_json(json_output)
                                ),
                                '}}'
                            )::JSON AS {transcripts_info_json}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    to_json(
                                        {{{",".join(clause_to_json)}}}
                                    )::JSON AS json_output
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                                WHERE "{transcripts_column_id}" IS NOT NULL
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        # Transcripts to info field in FORMAT
        if transcripts_info_field_format is not None:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Append ';<field>=<formatted>' to INFO when the value is set
            # (same transcripts_info_format interpolation caveat as the JSON
            # branch above)
            update_set_format.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_format}=',
                            t.{transcripts_info_format}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Build one 'transcript|field|...' string per transcript,
            # aggregate per variant, and apply the SET clauses joined on the
            # variant key
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_format)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    concat(
                                        "{transcripts_column_id}",
                                        '|',
                                        {", '|', ".join(clause_to_format)}
                                    ) AS {transcripts_info_format}
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
The `transcript_view_to_variants` function updates a variants table with information from
transcripts in JSON format.
Parameters
- transcripts_table: The `transcripts_table` parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the `param` dictionary or use a default value of "transcripts".
- transcripts_column_id: The `transcripts_column_id` parameter is used to specify the column in the `transcripts_table` that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database.
- transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format. This parameter allows you to define the column in the variants table that will hold the JSON-formatted information about transcripts.
- transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name.
- transcripts_info_format: The `transcripts_info_format` parameter is used to specify the format of the information about transcripts that will be stored in the variants table. This format can be used to define how the transcript information will be structured or displayed within the variants table.
- transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to specify the field in the VCF header that will contain information about transcripts in a specific format. This field will be added to the VCF header as an INFO field with the specified name.
- param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary that contains various configuration settings related to transcripts. It is used to provide default values for certain parameters if they are not explicitly provided when calling the method. The `param` dictionary can be passed as an argument.
Returns
The function
`transcript_view_to_variants` returns a boolean value. It returns `True` if the operation is successful and `False` if certain conditions are not met.